Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
71d33617
Commit
71d33617
authored
Apr 24, 2015
by
John Chodera (MSKCC)
Browse files
Merge remote-tracking branch 'upstream/master'
parents
eb232608
9da36463
Changes
127
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
719 additions
and
333 deletions
+719
-333
platforms/opencl/include/OpenCLParallelKernels.h
platforms/opencl/include/OpenCLParallelKernels.h
+8
-1
platforms/opencl/src/OpenCLContext.cpp
platforms/opencl/src/OpenCLContext.cpp
+4
-5
platforms/opencl/src/OpenCLFFT3D.cpp
platforms/opencl/src/OpenCLFFT3D.cpp
+136
-30
platforms/opencl/src/OpenCLIntegrationUtilities.cpp
platforms/opencl/src/OpenCLIntegrationUtilities.cpp
+39
-141
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+110
-48
platforms/opencl/src/OpenCLParallelKernels.cpp
platforms/opencl/src/OpenCLParallelKernels.cpp
+5
-0
platforms/opencl/src/OpenCLPlatform.cpp
platforms/opencl/src/OpenCLPlatform.cpp
+5
-7
platforms/opencl/src/OpenCLSort.cpp
platforms/opencl/src/OpenCLSort.cpp
+6
-3
platforms/opencl/src/kernels/customIntegratorGlobal.cl
platforms/opencl/src/kernels/customIntegratorGlobal.cl
+1
-1
platforms/opencl/src/kernels/customIntegratorPerDof.cl
platforms/opencl/src/kernels/customIntegratorPerDof.cl
+1
-1
platforms/opencl/src/kernels/customManyParticle.cl
platforms/opencl/src/kernels/customManyParticle.cl
+3
-1
platforms/opencl/src/kernels/fft.cl
platforms/opencl/src/kernels/fft.cl
+33
-2
platforms/opencl/src/kernels/fftR2C.cl
platforms/opencl/src/kernels/fftR2C.cl
+166
-0
platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
+19
-20
platforms/opencl/src/kernels/gbsaObc_cpu.cl
platforms/opencl/src/kernels/gbsaObc_cpu.cl
+16
-17
platforms/opencl/src/kernels/nonbonded_cpu.cl
platforms/opencl/src/kernels/nonbonded_cpu.cl
+7
-8
platforms/opencl/src/kernels/pme.cl
platforms/opencl/src/kernels/pme.cl
+68
-28
platforms/opencl/src/kernels/sort.cl
platforms/opencl/src/kernels/sort.cl
+0
-6
platforms/opencl/tests/TestOpenCLCMAPTorsionForce.cpp
platforms/opencl/tests/TestOpenCLCMAPTorsionForce.cpp
+58
-1
platforms/opencl/tests/TestOpenCLFFT.cpp
platforms/opencl/tests/TestOpenCLFFT.cpp
+34
-13
No files found.
platforms/opencl/include/OpenCLParallelKernels.h
View file @
71d33617
...
...
@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-201
3
Stanford University and the Authors. *
* Portions copyright (c) 2011-201
5
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -344,6 +344,13 @@ public:
* @return the potential energy due to the force
*/
double
execute
(
ContextImpl
&
context
,
bool
includeForces
,
bool
includeEnergy
);
/**
* Copy changed parameters over to a context.
*
* @param context the context to copy parameters to
* @param force the CMAPTorsionForce to copy the parameters from
*/
void
copyParametersToContext
(
ContextImpl
&
context
,
const
CMAPTorsionForce
&
force
);
private:
class
Task
;
OpenCLPlatform
::
PlatformData
&
data
;
...
...
platforms/opencl/src/OpenCLContext.cpp
View file @
71d33617
...
...
@@ -106,9 +106,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
// if they supplied a valid deviceIndex, we only look through that one
if
(
i
!=
deviceIndex
&&
deviceIndex
>=
0
&&
deviceIndex
<
(
int
)
devices
.
size
())
continue
;
if
(
platformVendor
==
"Apple"
&&
(
devices
[
i
].
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_CPU
||
devices
[
i
].
getInfo
<
CL_DEVICE_VENDOR
>
()
==
"AMD"
))
continue
;
// The CPU device on OS X won't work correctly, and there are serious bugs using AMD GPUs.
if
(
platformVendor
==
"Apple"
&&
(
devices
[
i
].
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_CPU
))
continue
;
// The CPU device on OS X won't work correctly.
int
maxSize
=
devices
[
i
].
getInfo
<
CL_DEVICE_MAX_WORK_ITEM_SIZES
>
()[
0
];
int
processingElementsPerComputeUnit
=
8
;
if
(
devices
[
i
].
getInfo
<
CL_DEVICE_TYPE
>
()
!=
CL_DEVICE_TYPE_GPU
)
{
...
...
@@ -170,6 +169,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
compilationDefines
[
"WORK_GROUP_SIZE"
]
=
intToString
(
ThreadBlockSize
);
if
(
platformVendor
.
size
()
>=
5
&&
platformVendor
.
substr
(
0
,
5
)
==
"Intel"
)
defaultOptimizationOptions
=
""
;
else
if
(
platformVendor
==
"Apple"
)
defaultOptimizationOptions
=
"-cl-mad-enable -cl-no-signed-zeros"
;
else
defaultOptimizationOptions
=
"-cl-fast-relaxed-math"
;
supports64BitGlobalAtomics
=
(
device
.
getInfo
<
CL_DEVICE_EXTENSIONS
>
().
find
(
"cl_khr_int64_base_atomics"
)
!=
string
::
npos
);
...
...
@@ -241,8 +242,6 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
}
else
simdWidth
=
1
;
if
(
platformVendor
==
"Apple"
&&
vendor
==
"AMD"
)
compilationDefines
[
"MAC_AMD_WORKAROUND"
]
=
""
;
if
(
supports64BitGlobalAtomics
)
compilationDefines
[
"SUPPORTS_64_BIT_ATOMICS"
]
=
""
;
if
(
supportsDoublePrecision
)
...
...
platforms/opencl/src/OpenCLFFT3D.cpp
View file @
71d33617
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-201
2
Stanford University and the Authors. *
* Portions copyright (c) 2009-201
5
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -35,25 +35,109 @@
using
namespace
OpenMM
;
using
namespace
std
;
OpenCLFFT3D
::
OpenCLFFT3D
(
OpenCLContext
&
context
,
int
xsize
,
int
ysize
,
int
zsize
)
:
context
(
context
),
xsize
(
xsize
),
ysize
(
ysize
),
zsize
(
zsize
)
{
zkernel
=
createKernel
(
xsize
,
ysize
,
zsize
,
zthreads
);
xkernel
=
createKernel
(
ysize
,
zsize
,
xsize
,
xthreads
);
ykernel
=
createKernel
(
zsize
,
xsize
,
ysize
,
ythreads
);
OpenCLFFT3D
::
OpenCLFFT3D
(
OpenCLContext
&
context
,
int
xsize
,
int
ysize
,
int
zsize
,
bool
realToComplex
)
:
context
(
context
),
xsize
(
xsize
),
ysize
(
ysize
),
zsize
(
zsize
)
{
packRealAsComplex
=
false
;
int
packedXSize
=
xsize
;
int
packedYSize
=
ysize
;
int
packedZSize
=
zsize
;
if
(
realToComplex
)
{
// If any axis size is even, we can pack the real values into a complex grid that is only half as large.
// Look for an appropriate axis.
packRealAsComplex
=
true
;
int
packedAxis
,
bufferSize
;
if
(
xsize
%
2
==
0
)
{
packedAxis
=
0
;
packedXSize
/=
2
;
bufferSize
=
packedXSize
;
}
else
if
(
ysize
%
2
==
0
)
{
packedAxis
=
1
;
packedYSize
/=
2
;
bufferSize
=
packedYSize
;
}
else
if
(
zsize
%
2
==
0
)
{
packedAxis
=
2
;
packedZSize
/=
2
;
bufferSize
=
packedZSize
;
}
else
packRealAsComplex
=
false
;
if
(
packRealAsComplex
)
{
// Build the kernels for packing and unpacking the data.
map
<
string
,
string
>
defines
;
defines
[
"XSIZE"
]
=
context
.
intToString
(
xsize
);
defines
[
"YSIZE"
]
=
context
.
intToString
(
ysize
);
defines
[
"ZSIZE"
]
=
context
.
intToString
(
zsize
);
defines
[
"PACKED_AXIS"
]
=
context
.
intToString
(
packedAxis
);
defines
[
"PACKED_XSIZE"
]
=
context
.
intToString
(
packedXSize
);
defines
[
"PACKED_YSIZE"
]
=
context
.
intToString
(
packedYSize
);
defines
[
"PACKED_ZSIZE"
]
=
context
.
intToString
(
packedZSize
);
defines
[
"M_PI"
]
=
context
.
doubleToString
(
M_PI
);
cl
::
Program
program
=
context
.
createProgram
(
OpenCLKernelSources
::
fftR2C
,
defines
);
packForwardKernel
=
cl
::
Kernel
(
program
,
"packForwardData"
);
unpackForwardKernel
=
cl
::
Kernel
(
program
,
"unpackForwardData"
);
unpackForwardKernel
.
setArg
(
2
,
bufferSize
*
(
context
.
getUseDoublePrecision
()
?
sizeof
(
mm_double2
)
:
sizeof
(
mm_float2
)),
NULL
);
packBackwardKernel
=
cl
::
Kernel
(
program
,
"packBackwardData"
);
packBackwardKernel
.
setArg
(
2
,
bufferSize
*
(
context
.
getUseDoublePrecision
()
?
sizeof
(
mm_double2
)
:
sizeof
(
mm_float2
)),
NULL
);
unpackBackwardKernel
=
cl
::
Kernel
(
program
,
"unpackBackwardData"
);
}
}
bool
inputIsReal
=
(
realToComplex
&&
!
packRealAsComplex
);
zkernel
=
createKernel
(
packedXSize
,
packedYSize
,
packedZSize
,
zthreads
,
0
,
true
,
inputIsReal
);
xkernel
=
createKernel
(
packedYSize
,
packedZSize
,
packedXSize
,
xthreads
,
1
,
true
,
inputIsReal
);
ykernel
=
createKernel
(
packedZSize
,
packedXSize
,
packedYSize
,
ythreads
,
2
,
true
,
inputIsReal
);
invzkernel
=
createKernel
(
packedXSize
,
packedYSize
,
packedZSize
,
zthreads
,
0
,
false
,
inputIsReal
);
invxkernel
=
createKernel
(
packedYSize
,
packedZSize
,
packedXSize
,
xthreads
,
1
,
false
,
inputIsReal
);
invykernel
=
createKernel
(
packedZSize
,
packedXSize
,
packedYSize
,
ythreads
,
2
,
false
,
inputIsReal
);
}
void
OpenCLFFT3D
::
execFFT
(
OpenCLArray
&
in
,
OpenCLArray
&
out
,
bool
forward
)
{
zkernel
.
setArg
<
cl
::
Buffer
>
(
0
,
in
.
getDeviceBuffer
());
zkernel
.
setArg
<
cl
::
Buffer
>
(
1
,
out
.
getDeviceBuffer
());
zkernel
.
setArg
<
cl_int
>
(
2
,
forward
?
1
:
-
1
);
context
.
executeKernel
(
zkernel
,
xsize
*
ysize
*
zsize
,
zthreads
);
xkernel
.
setArg
<
cl
::
Buffer
>
(
0
,
out
.
getDeviceBuffer
());
xkernel
.
setArg
<
cl
::
Buffer
>
(
1
,
in
.
getDeviceBuffer
());
xkernel
.
setArg
<
cl_int
>
(
2
,
forward
?
1
:
-
1
);
context
.
executeKernel
(
xkernel
,
xsize
*
ysize
*
zsize
,
xthreads
);
ykernel
.
setArg
<
cl
::
Buffer
>
(
0
,
in
.
getDeviceBuffer
());
ykernel
.
setArg
<
cl
::
Buffer
>
(
1
,
out
.
getDeviceBuffer
());
ykernel
.
setArg
<
cl_int
>
(
2
,
forward
?
1
:
-
1
);
context
.
executeKernel
(
ykernel
,
xsize
*
ysize
*
zsize
,
ythreads
);
cl
::
Kernel
kernel1
=
(
forward
?
zkernel
:
invzkernel
);
cl
::
Kernel
kernel2
=
(
forward
?
xkernel
:
invxkernel
);
cl
::
Kernel
kernel3
=
(
forward
?
ykernel
:
invykernel
);
if
(
packRealAsComplex
)
{
cl
::
Kernel
packKernel
=
(
forward
?
packForwardKernel
:
packBackwardKernel
);
cl
::
Kernel
unpackKernel
=
(
forward
?
unpackForwardKernel
:
unpackBackwardKernel
);
int
gridSize
=
xsize
*
ysize
*
zsize
/
2
;
// Pack the data into a half sized grid.
packKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
in
.
getDeviceBuffer
());
packKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
out
.
getDeviceBuffer
());
context
.
executeKernel
(
packKernel
,
gridSize
);
// Perform the FFT.
kernel1
.
setArg
<
cl
::
Buffer
>
(
0
,
out
.
getDeviceBuffer
());
kernel1
.
setArg
<
cl
::
Buffer
>
(
1
,
in
.
getDeviceBuffer
());
context
.
executeKernel
(
kernel1
,
gridSize
,
zthreads
);
kernel2
.
setArg
<
cl
::
Buffer
>
(
0
,
in
.
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
1
,
out
.
getDeviceBuffer
());
context
.
executeKernel
(
kernel2
,
gridSize
,
xthreads
);
kernel3
.
setArg
<
cl
::
Buffer
>
(
0
,
out
.
getDeviceBuffer
());
kernel3
.
setArg
<
cl
::
Buffer
>
(
1
,
in
.
getDeviceBuffer
());
context
.
executeKernel
(
kernel3
,
gridSize
,
ythreads
);
// Unpack the data.
unpackKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
in
.
getDeviceBuffer
());
unpackKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
out
.
getDeviceBuffer
());
context
.
executeKernel
(
unpackKernel
,
gridSize
);
}
else
{
kernel1
.
setArg
<
cl
::
Buffer
>
(
0
,
in
.
getDeviceBuffer
());
kernel1
.
setArg
<
cl
::
Buffer
>
(
1
,
out
.
getDeviceBuffer
());
context
.
executeKernel
(
kernel1
,
xsize
*
ysize
*
zsize
,
zthreads
);
kernel2
.
setArg
<
cl
::
Buffer
>
(
0
,
out
.
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
1
,
in
.
getDeviceBuffer
());
context
.
executeKernel
(
kernel2
,
xsize
*
ysize
*
zsize
,
xthreads
);
kernel3
.
setArg
<
cl
::
Buffer
>
(
0
,
in
.
getDeviceBuffer
());
kernel3
.
setArg
<
cl
::
Buffer
>
(
1
,
out
.
getDeviceBuffer
());
context
.
executeKernel
(
kernel3
,
xsize
*
ysize
*
zsize
,
ythreads
);
}
}
int
OpenCLFFT3D
::
findLegalDimension
(
int
minimum
)
{
...
...
@@ -73,8 +157,10 @@ int OpenCLFFT3D::findLegalDimension(int minimum) {
}
}
cl
::
Kernel
OpenCLFFT3D
::
createKernel
(
int
xsize
,
int
ysize
,
int
zsize
,
int
&
threads
)
{
cl
::
Kernel
OpenCLFFT3D
::
createKernel
(
int
xsize
,
int
ysize
,
int
zsize
,
int
&
threads
,
int
axis
,
bool
forward
,
bool
inputIsReal
)
{
int
maxThreads
=
std
::
min
(
256
,
(
int
)
context
.
getDevice
().
getInfo
<
CL_DEVICE_MAX_WORK_GROUP_SIZE
>
());
while
(
maxThreads
>
128
&&
maxThreads
-
64
>=
zsize
)
maxThreads
-=
64
;
bool
isCPU
=
context
.
getDevice
().
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_CPU
;
while
(
true
)
{
bool
loopRequired
=
(
zsize
>
maxThreads
||
isCPU
);
...
...
@@ -137,10 +223,10 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source
<<
"real2 b2 = "
<<
context
.
doubleToString
((
2
*
cos
(
2
*
M_PI
/
7
)
-
cos
(
4
*
M_PI
/
7
)
-
cos
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d0-d4);
\n
"
;
source
<<
"real2 b3 = "
<<
context
.
doubleToString
((
cos
(
2
*
M_PI
/
7
)
-
2
*
cos
(
4
*
M_PI
/
7
)
+
cos
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d4-d2);
\n
"
;
source
<<
"real2 b4 = "
<<
context
.
doubleToString
((
cos
(
2
*
M_PI
/
7
)
+
cos
(
4
*
M_PI
/
7
)
-
2
*
cos
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d2-d0);
\n
"
;
source
<<
"real2 b5 = -
sign
*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
+
sin
(
4
*
M_PI
/
7
)
-
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d7+d1);
\n
"
;
source
<<
"real2 b6 = -
sign
*"
<<
context
.
doubleToString
((
2
*
sin
(
2
*
M_PI
/
7
)
-
sin
(
4
*
M_PI
/
7
)
+
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d1-d5);
\n
"
;
source
<<
"real2 b7 = -
sign
*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
-
2
*
sin
(
4
*
M_PI
/
7
)
-
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d5-d3);
\n
"
;
source
<<
"real2 b8 = -
sign
*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
+
sin
(
4
*
M_PI
/
7
)
+
2
*
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d3-d1);
\n
"
;
source
<<
"real2 b5 = -
(SIGN)
*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
+
sin
(
4
*
M_PI
/
7
)
-
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d7+d1);
\n
"
;
source
<<
"real2 b6 = -
(SIGN)
*"
<<
context
.
doubleToString
((
2
*
sin
(
2
*
M_PI
/
7
)
-
sin
(
4
*
M_PI
/
7
)
+
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d1-d5);
\n
"
;
source
<<
"real2 b7 = -
(SIGN)
*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
-
2
*
sin
(
4
*
M_PI
/
7
)
-
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d5-d3);
\n
"
;
source
<<
"real2 b8 = -
(SIGN)
*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
+
sin
(
4
*
M_PI
/
7
)
+
2
*
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d3-d1);
\n
"
;
source
<<
"real2 t0 = b0+b1;
\n
"
;
source
<<
"real2 t1 = b2+b3;
\n
"
;
source
<<
"real2 t2 = b4-b3;
\n
"
;
...
...
@@ -178,8 +264,8 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source
<<
"real2 d7 = d6+d5;
\n
"
;
source
<<
"real2 d8 = d6-d5;
\n
"
;
string
coeff
=
context
.
doubleToString
(
sin
(
0.2
*
M_PI
)
/
sin
(
0.4
*
M_PI
));
source
<<
"real2 d9 =
sign
*(real2) (d2.y+"
<<
coeff
<<
"*d3.y, -d2.x-"
<<
coeff
<<
"*d3.x);
\n
"
;
source
<<
"real2 d10 =
sign
*(real2) ("
<<
coeff
<<
"*d2.y-d3.y, d3.x-"
<<
coeff
<<
"*d2.x);
\n
"
;
source
<<
"real2 d9 =
(SIGN)
*(real2) (d2.y+"
<<
coeff
<<
"*d3.y, -d2.x-"
<<
coeff
<<
"*d3.x);
\n
"
;
source
<<
"real2 d10 =
(SIGN)
*(real2) ("
<<
coeff
<<
"*d2.y-d3.y, d3.x-"
<<
coeff
<<
"*d2.x);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+4*j*"
<<
m
<<
"] = c0+d4;
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(4*j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
5
*
L
)
<<
"], d7+d9);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(4*j+2)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
2
*
zsize
)
<<
"/"
<<
(
5
*
L
)
<<
"], d8+d10);
\n
"
;
...
...
@@ -194,7 +280,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source
<<
"real2 d0 = c0+c2;
\n
"
;
source
<<
"real2 d1 = c0-c2;
\n
"
;
source
<<
"real2 d2 = c1+c3;
\n
"
;
source
<<
"real2 d3 =
sign
*(real2) (c1.y-c3.y, c3.x-c1.x);
\n
"
;
source
<<
"real2 d3 =
(SIGN)
*(real2) (c1.y-c3.y, c3.x-c1.x);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+3*j*"
<<
m
<<
"] = d0+d2;
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(3*j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
4
*
L
)
<<
"], d1+d3);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(3*j+2)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
2
*
zsize
)
<<
"/"
<<
(
4
*
L
)
<<
"], d0-d2);
\n
"
;
...
...
@@ -206,7 +292,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source
<<
"real2 c2 = data"
<<
input
<<
"[base+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 d0 = c1+c2;
\n
"
;
source
<<
"real2 d1 = c0-0.5f*d0;
\n
"
;
source
<<
"real2 d2 =
sign
*"
<<
context
.
doubleToString
(
sin
(
M_PI
/
3.0
))
<<
"*(real2) (c1.y-c2.y, c2.x-c1.x);
\n
"
;
source
<<
"real2 d2 =
(SIGN)
*"
<<
context
.
doubleToString
(
sin
(
M_PI
/
3.0
))
<<
"*(real2) (c1.y-c2.y, c2.x-c1.x);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+2*j*"
<<
m
<<
"] = c0+d0;
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(2*j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
3
*
L
)
<<
"], d1+d2);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(2*j+2)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
2
*
zsize
)
<<
"/"
<<
(
3
*
L
)
<<
"], d1-d2);
\n
"
;
...
...
@@ -226,13 +312,27 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
// Create the kernel.
bool
outputIsReal
=
(
inputIsReal
&&
axis
==
2
&&
!
forward
);
bool
outputIsPacked
=
(
inputIsReal
&&
axis
==
2
&&
forward
);
string
outputSuffix
=
(
outputIsReal
?
".x"
:
""
);
if
(
loopRequired
)
{
if
(
outputIsPacked
)
source
<<
"if (x < XSIZE/2+1)
\n
"
;
source
<<
"for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))
\n
"
;
source
<<
"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"
<<
(
stage
%
2
)
<<
"[z];
\n
"
;
if
(
outputIsPacked
)
source
<<
"out[y*(ZSIZE*(XSIZE/2+1))+z*(XSIZE/2+1)+x] = data"
<<
(
stage
%
2
)
<<
"[z]"
<<
outputSuffix
<<
";
\n
"
;
else
source
<<
"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"
<<
(
stage
%
2
)
<<
"[z]"
<<
outputSuffix
<<
";
\n
"
;
}
else
{
source
<<
"if (index < XSIZE*YSIZE)
\n
"
;
source
<<
"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"
<<
(
stage
%
2
)
<<
"[get_local_id(0)];
\n
"
;
if
(
outputIsPacked
)
{
source
<<
"if (index < XSIZE*YSIZE && x < XSIZE/2+1)
\n
"
;
source
<<
"out[y*(ZSIZE*(XSIZE/2+1))+(get_local_id(0)%ZSIZE)*(XSIZE/2+1)+x] = data"
<<
(
stage
%
2
)
<<
"[get_local_id(0)]"
<<
outputSuffix
<<
";
\n
"
;
}
else
{
source
<<
"if (index < XSIZE*YSIZE)
\n
"
;
source
<<
"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"
<<
(
stage
%
2
)
<<
"[get_local_id(0)]"
<<
outputSuffix
<<
";
\n
"
;
}
}
map
<
string
,
string
>
replacements
;
replacements
[
"XSIZE"
]
=
context
.
intToString
(
xsize
);
...
...
@@ -242,6 +342,12 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
replacements
[
"M_PI"
]
=
context
.
doubleToString
(
M_PI
);
replacements
[
"COMPUTE_FFT"
]
=
source
.
str
();
replacements
[
"LOOP_REQUIRED"
]
=
(
loopRequired
?
"1"
:
"0"
);
replacements
[
"SIGN"
]
=
(
forward
?
"1"
:
"-1"
);
replacements
[
"INPUT_TYPE"
]
=
(
inputIsReal
&&
axis
==
0
&&
forward
?
"real"
:
"real2"
);
replacements
[
"OUTPUT_TYPE"
]
=
(
outputIsReal
?
"real"
:
"real2"
);
replacements
[
"INPUT_IS_REAL"
]
=
(
inputIsReal
&&
axis
==
0
&&
forward
?
"1"
:
"0"
);
replacements
[
"INPUT_IS_PACKED"
]
=
(
inputIsReal
&&
axis
==
0
&&
!
forward
?
"1"
:
"0"
);
replacements
[
"OUTPUT_IS_PACKED"
]
=
(
outputIsPacked
?
"1"
:
"0"
);
cl
::
Program
program
=
context
.
createProgram
(
context
.
replaceStrings
(
OpenCLKernelSources
::
fft
,
replacements
));
cl
::
Kernel
kernel
(
program
,
"execFFT"
);
threads
=
(
isCPU
?
1
:
blocksPerGroup
*
zsize
);
...
...
@@ -253,9 +359,9 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
continue
;
}
int
bufferSize
=
blocksPerGroup
*
zsize
*
(
context
.
getUseDoublePrecision
()
?
sizeof
(
mm_double2
)
:
sizeof
(
mm_float2
));
kernel
.
setArg
(
2
,
bufferSize
,
NULL
);
kernel
.
setArg
(
3
,
bufferSize
,
NULL
);
kernel
.
setArg
(
4
,
bufferSize
,
NULL
);
kernel
.
setArg
(
5
,
bufferSize
,
NULL
);
return
kernel
;
}
}
platforms/opencl/src/OpenCLIntegrationUtilities.cpp
View file @
71d33617
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-201
4
Stanford University and the Authors. *
* Portions copyright (c) 2009-201
5
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -32,6 +32,7 @@
#include "openmm/VirtualSite.h"
#include "quern.h"
#include "OpenCLExpressionUtilities.h"
#include "ReferenceCCMAAlgorithm.h"
#include <algorithm>
#include <cmath>
#include <cstdlib>
...
...
@@ -323,157 +324,54 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
int
numCCMA
=
(
int
)
ccmaConstraints
.
size
();
if
(
numCCMA
>
0
)
{
vector
<
vector
<
int
>
>
atomConstraints
(
context
.
getNumAtoms
());
// Record information needed by ReferenceCCMAAlgorithm.
vector
<
pair
<
int
,
int
>
>
refIndices
(
numCCMA
);
vector
<
RealOpenMM
>
refDistance
(
numCCMA
);
for
(
int
i
=
0
;
i
<
numCCMA
;
i
++
)
{
atomConstraints
[
atom1
[
ccmaConstraints
[
i
]]].
push_back
(
i
);
atomConstraints
[
atom2
[
ccmaConstraints
[
i
]]].
push_back
(
i
);
}
vector
<
vector
<
int
>
>
linkedConstraints
(
numCCMA
);
for
(
unsigned
atom
=
0
;
atom
<
atomConstraints
.
size
();
atom
++
)
{
for
(
unsigned
i
=
0
;
i
<
atomConstraints
[
atom
].
size
();
i
++
)
for
(
unsigned
j
=
0
;
j
<
i
;
j
++
)
{
int
c1
=
atomConstraints
[
atom
][
i
];
int
c2
=
atomConstraints
[
atom
][
j
];
linkedConstraints
[
c1
].
push_back
(
c2
);
linkedConstraints
[
c2
].
push_back
(
c1
);
}
int
index
=
ccmaConstraints
[
i
];
refIndices
[
i
]
=
make_pair
(
atom1
[
index
],
atom2
[
index
]);
refDistance
[
i
]
=
distance
[
index
];
}
int
maxLinks
=
0
;
for
(
unsigned
i
=
0
;
i
<
linkedConstraints
.
size
();
i
++
)
maxLinks
=
max
(
maxLinks
,
(
int
)
linkedConstraints
[
i
].
size
());
int
maxAtomConstraints
=
0
;
for
(
unsigned
i
=
0
;
i
<
atomConstraints
.
size
();
i
++
)
maxAtomConstraints
=
max
(
maxAtomConstraints
,
(
int
)
atomConstraints
[
i
].
size
());
// Compute the constraint coupling matrix
vector
<
vector
<
int
>
>
atomAngles
(
numAtoms
);
HarmonicAngleForce
const
*
angleForce
=
NULL
;
for
(
int
i
=
0
;
i
<
system
.
getNumForces
()
&&
angleForce
==
NULL
;
i
++
)
angleForce
=
dynamic_cast
<
HarmonicAngleForce
const
*>
(
&
system
.
getForce
(
i
));
if
(
angleForce
!=
NULL
)
for
(
int
i
=
0
;
i
<
angleForce
->
getNumAngles
();
i
++
)
{
int
particle1
,
particle2
,
particle3
;
double
angle
,
k
;
angleForce
->
getAngleParameters
(
i
,
particle1
,
particle2
,
particle3
,
angle
,
k
);
atomAngles
[
particle2
].
push_back
(
i
);
}
vector
<
vector
<
pair
<
int
,
double
>
>
>
matrix
(
numCCMA
);
for
(
int
j
=
0
;
j
<
numCCMA
;
j
++
)
{
for
(
int
k
=
0
;
k
<
numCCMA
;
k
++
)
{
if
(
j
==
k
)
{
matrix
[
j
].
push_back
(
pair
<
int
,
double
>
(
j
,
1.0
));
continue
;
}
double
scale
;
int
cj
=
ccmaConstraints
[
j
];
int
ck
=
ccmaConstraints
[
k
];
int
atomj0
=
atom1
[
cj
];
int
atomj1
=
atom2
[
cj
];
int
atomk0
=
atom1
[
ck
];
int
atomk1
=
atom2
[
ck
];
int
atoma
,
atomb
,
atomc
;
double
imj0
=
1.0
/
system
.
getParticleMass
(
atomj0
);
double
imj1
=
1.0
/
system
.
getParticleMass
(
atomj1
);
if
(
atomj0
==
atomk0
)
{
atoma
=
atomj1
;
atomb
=
atomj0
;
atomc
=
atomk1
;
scale
=
imj0
/
(
imj0
+
imj1
);
}
else
if
(
atomj1
==
atomk1
)
{
atoma
=
atomj0
;
atomb
=
atomj1
;
atomc
=
atomk0
;
scale
=
imj1
/
(
imj0
+
imj1
);
}
else
if
(
atomj0
==
atomk1
)
{
atoma
=
atomj1
;
atomb
=
atomj0
;
atomc
=
atomk0
;
scale
=
imj0
/
(
imj0
+
imj1
);
}
else
if
(
atomj1
==
atomk0
)
{
atoma
=
atomj0
;
atomb
=
atomj1
;
atomc
=
atomk1
;
scale
=
imj1
/
(
imj0
+
imj1
);
}
else
continue
;
// These constraints are not connected.
// Look for a third constraint forming a triangle with these two.
vector
<
RealOpenMM
>
refMasses
(
numAtoms
);
for
(
int
i
=
0
;
i
<
numAtoms
;
++
i
)
refMasses
[
i
]
=
(
RealOpenMM
)
system
.
getParticleMass
(
i
);
bool
foundConstraint
=
false
;
for
(
int
m
=
0
;
m
<
numCCMA
;
m
++
)
{
int
other
=
ccmaConstraints
[
m
];
if
((
atom1
[
other
]
==
atoma
&&
atom2
[
other
]
==
atomc
)
||
(
atom1
[
other
]
==
atomc
&&
atom2
[
other
]
==
atoma
))
{
double
d1
=
distance
[
cj
];
double
d2
=
distance
[
ck
];
double
d3
=
distance
[
other
];
matrix
[
j
].
push_back
(
pair
<
int
,
double
>
(
k
,
scale
*
(
d1
*
d1
+
d2
*
d2
-
d3
*
d3
)
/
(
2.0
*
d1
*
d2
)));
foundConstraint
=
true
;
break
;
}
}
if
(
!
foundConstraint
&&
angleForce
!=
NULL
)
{
// We didn't find one, so look for an angle force field term.
const
vector
<
int
>&
angleCandidates
=
atomAngles
[
atomb
];
for
(
vector
<
int
>::
const_iterator
iter
=
angleCandidates
.
begin
();
iter
!=
angleCandidates
.
end
();
iter
++
)
{
int
particle1
,
particle2
,
particle3
;
double
angle
,
ka
;
angleForce
->
getAngleParameters
(
*
iter
,
particle1
,
particle2
,
particle3
,
angle
,
ka
);
if
((
particle1
==
atoma
&&
particle3
==
atomc
)
||
(
particle3
==
atoma
&&
particle1
==
atomc
))
{
matrix
[
j
].
push_back
(
pair
<
int
,
double
>
(
k
,
scale
*
cos
(
angle
)));
break
;
}
}
// Look up angles for CCMA.
vector
<
ReferenceCCMAAlgorithm
::
AngleInfo
>
angles
;
for
(
int
i
=
0
;
i
<
system
.
getNumForces
();
i
++
)
{
const
HarmonicAngleForce
*
force
=
dynamic_cast
<
const
HarmonicAngleForce
*>
(
&
system
.
getForce
(
i
));
if
(
force
!=
NULL
)
{
for
(
int
j
=
0
;
j
<
force
->
getNumAngles
();
j
++
)
{
int
atom1
,
atom2
,
atom3
;
double
angle
,
k
;
force
->
getAngleParameters
(
j
,
atom1
,
atom2
,
atom3
,
angle
,
k
);
angles
.
push_back
(
ReferenceCCMAAlgorithm
::
AngleInfo
(
atom1
,
atom2
,
atom3
,
(
RealOpenMM
)
angle
));
}
}
}
// Invert it using QR.
vector
<
int
>
matrixRowStart
;
vector
<
int
>
matrixColIndex
;
vector
<
double
>
matrixValue
;
for
(
int
i
=
0
;
i
<
numCCMA
;
i
++
)
{
matrixRowStart
.
push_back
(
matrixValue
.
size
());
for
(
int
j
=
0
;
j
<
(
int
)
matrix
[
i
].
size
();
j
++
)
{
pair
<
int
,
double
>
element
=
matrix
[
i
][
j
];
matrixColIndex
.
push_back
(
element
.
first
);
matrixValue
.
push_back
(
element
.
second
);
}
}
matrixRowStart
.
push_back
(
matrixValue
.
size
());
int
*
qRowStart
,
*
qColIndex
,
*
rRowStart
,
*
rColIndex
;
double
*
qValue
,
*
rValue
;
int
result
=
QUERN_compute_qr
(
numCCMA
,
numCCMA
,
&
matrixRowStart
[
0
],
&
matrixColIndex
[
0
],
&
matrixValue
[
0
],
NULL
,
&
qRowStart
,
&
qColIndex
,
&
qValue
,
&
rRowStart
,
&
rColIndex
,
&
rValue
);
vector
<
double
>
rhs
(
numCCMA
);
matrix
.
clear
();
matrix
.
resize
(
numCCMA
);
for
(
int
i
=
0
;
i
<
numCCMA
;
i
++
)
{
// Extract column i of the inverse matrix.
for
(
int
j
=
0
;
j
<
numCCMA
;
j
++
)
rhs
[
j
]
=
(
i
==
j
?
1.0
:
0.0
);
result
=
QUERN_multiply_with_q_transpose
(
numCCMA
,
qRowStart
,
qColIndex
,
qValue
,
&
rhs
[
0
]);
result
=
QUERN_solve_with_r
(
numCCMA
,
rRowStart
,
rColIndex
,
rValue
,
&
rhs
[
0
],
&
rhs
[
0
]);
for
(
int
j
=
0
;
j
<
numCCMA
;
j
++
)
{
double
value
=
rhs
[
j
]
*
distance
[
ccmaConstraints
[
i
]]
/
distance
[
ccmaConstraints
[
j
]];
if
(
abs
(
value
)
>
0.1
)
matrix
[
j
].
push_back
(
pair
<
int
,
double
>
(
i
,
value
));
}
}
QUERN_free_result
(
qRowStart
,
qColIndex
,
qValue
);
QUERN_free_result
(
rRowStart
,
rColIndex
,
rValue
);
// Create a ReferenceCCMAAlgorithm. It will build and invert the constraint matrix for us.
ReferenceCCMAAlgorithm
ccma
(
numAtoms
,
numCCMA
,
refIndices
,
refDistance
,
refMasses
,
angles
,
0.1
);
vector
<
vector
<
pair
<
int
,
double
>
>
>
matrix
=
ccma
.
getMatrix
();
int
maxRowElements
=
0
;
for
(
unsigned
i
=
0
;
i
<
matrix
.
size
();
i
++
)
maxRowElements
=
max
(
maxRowElements
,
(
int
)
matrix
[
i
].
size
());
maxRowElements
++
;
// Build the list of constraints for each atom.
vector
<
vector
<
int
>
>
atomConstraints
(
context
.
getNumAtoms
());
for
(
int
i
=
0
;
i
<
numCCMA
;
i
++
)
{
atomConstraints
[
atom1
[
ccmaConstraints
[
i
]]].
push_back
(
i
);
atomConstraints
[
atom2
[
ccmaConstraints
[
i
]]].
push_back
(
i
);
}
int
maxAtomConstraints
=
0
;
for
(
unsigned
i
=
0
;
i
<
atomConstraints
.
size
();
i
++
)
maxAtomConstraints
=
max
(
maxAtomConstraints
,
(
int
)
atomConstraints
[
i
].
size
());
// Sort the constraints.
vector
<
int
>
constraintOrder
(
numCCMA
);
...
...
platforms/opencl/src/OpenCLKernels.cpp
View file @
71d33617
This diff is collapsed.
Click to expand it.
platforms/opencl/src/OpenCLParallelKernels.cpp
View file @
71d33617
...
...
@@ -492,6 +492,11 @@ double OpenCLParallelCalcCMAPTorsionForceKernel::execute(ContextImpl& context, b
return
0.0
;
}
void
OpenCLParallelCalcCMAPTorsionForceKernel
::
copyParametersToContext
(
ContextImpl
&
context
,
const
CMAPTorsionForce
&
force
)
{
for
(
int
i
=
0
;
i
<
(
int
)
kernels
.
size
();
i
++
)
getKernel
(
i
).
copyParametersToContext
(
context
,
force
);
}
class
OpenCLParallelCalcCustomTorsionForceKernel
::
Task
:
public
OpenCLContext
::
WorkTask
{
public:
Task
(
ContextImpl
&
context
,
OpenCLCalcCustomTorsionForceKernel
&
kernel
,
bool
includeForce
,
...
...
platforms/opencl/src/OpenCLPlatform.cpp
View file @
71d33617
...
...
@@ -109,7 +109,7 @@ bool OpenCLPlatform::supportsDoublePrecision() const {
bool
OpenCLPlatform
::
isPlatformSupported
()
{
// Return false for OpenCL implementations that are known
// to be buggy (Apple OSX
since 10.7.5)
// to be buggy (Apple OS
X
prior to 10.10).
#ifdef __APPLE__
char
str
[
256
];
...
...
@@ -122,12 +122,10 @@ bool OpenCLPlatform::isPlatformSupported() {
if
(
sscanf
(
str
,
"%d.%d.%d"
,
&
major
,
&
minor
,
&
micro
)
!=
3
)
return
false
;
if
((
major
>
11
)
||
(
major
==
11
&&
minor
>
4
)
||
(
major
==
11
&&
minor
==
4
&&
micro
>=
2
))
// 11.4.2 is the darwin release corresponding to OSX 10.7.5, which is the
// point at which a number of serious bugs were introduced into the
// Apple OpenCL libraries, resulting in catistrophically incorrect MD simulations
// (see https://github.com/SimTk/openmm/issues/395 for example). Once a fix is released,
// this version check should be updated.
if
(
major
<
14
||
(
major
==
14
&&
minor
<
3
))
// 14.3.0 is the darwin release corresponding to OS X 10.10.3. Versions prior to that
// contained a number of serious bugs in the Apple OpenCL libraries.
// (See https://github.com/SimTk/openmm/issues/395 for example.)
return
false
;
#endif
...
...
platforms/opencl/src/OpenCLSort.cpp
View file @
71d33617
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-201
3
Stanford University and the Authors. *
* Portions copyright (c) 2010-201
5
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -42,7 +42,6 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
replacements
[
"MIN_KEY"
]
=
trait
->
getMinKey
();
replacements
[
"MAX_KEY"
]
=
trait
->
getMaxKey
();
replacements
[
"MAX_VALUE"
]
=
trait
->
getMaxValue
();
replacements
[
"VALUE_IS_INT2"
]
=
(
trait
->
getDataType
()
==
std
::
string
(
"int2"
)
?
"1"
:
"0"
);
cl
::
Program
program
=
context
.
createProgram
(
context
.
replaceStrings
(
OpenCLKernelSources
::
sort
,
replacements
));
shortListKernel
=
cl
::
Kernel
(
program
,
"sortShortList"
);
computeRangeKernel
=
cl
::
Kernel
(
program
,
"computeRange"
);
...
...
@@ -59,7 +58,11 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
unsigned
int
maxRangeSize
=
std
::
min
(
maxGroupSize
,
(
unsigned
int
)
computeRangeKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
()));
unsigned
int
maxPositionsSize
=
std
::
min
(
maxGroupSize
,
(
unsigned
int
)
computeBucketPositionsKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
()));
unsigned
int
maxShortListSize
=
shortListKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
());
isShortList
=
(
length
<=
maxLocalBuffer
&&
length
<
maxShortListSize
);
// On Qualcomm's OpenCL, it's essential to check against maxShortListSize. Otherwise you get a crash.
// But AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
// maximum, so including the check hurts performance. For the moment I'm going to just comment it out.
// If we officially support Qualcomm in the future, we'll need to do something better.
isShortList
=
(
length
<=
maxLocalBuffer
/* && length < maxShortListSize*/
);
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxRangeSize
;
rangeKernelSize
*=
2
)
;
positionsKernelSize
=
std
::
min
(
rangeKernelSize
,
maxPositionsSize
);
...
...
platforms/opencl/src/kernels/customIntegratorGlobal.cl
View file @
71d33617
__kernel
void
computeGlobal
(
__global
mixed2*
restrict
dt,
__global
mixed*
restrict
globals,
__global
mixed*
restrict
params,
float
uniform,
float
gaussian,
__global
const
real
*
restrict
energy
)
{
float
uniform,
float
gaussian,
const
real
energy
)
{
COMPUTE_STEP
}
platforms/opencl/src/kernels/customIntegratorPerDof.cl
View file @
71d33617
...
...
@@ -26,7 +26,7 @@ void storePos(__global real4* restrict posq, __global real4* restrict posqCorrec
__kernel
void
computePerDof
(
__global
real4*
restrict
posq,
__global
real4*
restrict
posqCorrection,
__global
mixed4*
restrict
posDelta,
__global
mixed4*
restrict
velm,
__global
const
real4*
restrict
force,
__global
const
mixed2*
restrict
dt,
__global
const
mixed*
restrict
globals,
__global
const
mixed*
restrict
params,
__global
mixed*
restrict
sum,
__global
const
float4*
restrict
gaussianValues,
unsigned
int
gaussianBaseIndex,
__global
const
float4*
restrict
uniformValues,
__global
const
real
*
restrict
energy
unsigned
int
gaussianBaseIndex,
__global
const
float4*
restrict
uniformValues,
const
real
energy
PARAMETER_ARGUMENTS
)
{
mixed
stepSize
=
dt[0].y
;
int
index
=
get_global_id
(
0
)
;
...
...
platforms/opencl/src/kernels/customManyParticle.cl
View file @
71d33617
...
...
@@ -227,7 +227,9 @@ __kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, rea
int
start
=
block2*TILE_SIZE
;
int
included[TILE_SIZE]
;
int
numIncluded
=
0
;
SYNC_WARPS
;
positionCache[get_local_id
(
0
)
]
=
posq[start+indexInWarp]
;
SYNC_WARPS
;
if
(
atom1
<
NUM_ATOMS
)
{
for
(
int
j
=
0
; j < 32; j++) {
int
atom2
=
start+j
;
...
...
@@ -287,7 +289,7 @@ __kernel void computeNeighborStartIndices(__global int* restrict numNeighborsFor
unsigned
int
globalIndex
=
startAtom+get_local_id
(
0
)
;
posBuffer[get_local_id
(
0
)
]
=
(
globalIndex
<
NUM_ATOMS
?
numNeighborsForAtom[globalIndex]
:
0
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Perform
a
parallel
prefix
sum.
...
...
platforms/opencl/src/kernels/fft.cl
View file @
71d33617
...
...
@@ -2,26 +2,57 @@ real2 multiplyComplex(real2 c1, real2 c2) {
return
(
real2
)
(
c1.x*c2.x-c1.y*c2.y,
c1.x*c2.y+c1.y*c2.x
)
;
}
/**
*
Load
a
value
from
the
half-complex
grid
produces
by
a
real-to-complex
transform.
*/
real2
loadComplexValue
(
__global
const
real2*
restrict
in,
int
x,
int
y,
int
z
)
{
const
int
inputZSize
=
ZSIZE/2+1
;
if
(
z
<
inputZSize
)
return
in[x*YSIZE*inputZSize+y*inputZSize+z]
;
int
xp
=
(
x
==
0
?
0
:
XSIZE-x
)
;
int
yp
=
(
y
==
0
?
0
:
YSIZE-y
)
;
real2
value
=
in[xp*YSIZE*inputZSize+yp*inputZSize+
(
ZSIZE-z
)
]
;
return
(
real2
)
(
value.x,
-value.y
)
;
}
/**
*
Perform
a
1D
FFT
on
each
row
along
one
axis.
*/
__kernel
void
execFFT
(
__global
const
real2
*
restrict
in,
__global
real2
*
restrict
out,
int
sign,
__local
real2*
restrict
w,
__kernel
void
execFFT
(
__global
const
INPUT_TYPE
*
restrict
in,
__global
OUTPUT_TYPE
*
restrict
out,
__local
real2*
restrict
w,
__local
real2*
restrict
data0,
__local
real2*
restrict
data1
)
{
for
(
int
i
=
get_local_id
(
0
)
; i < ZSIZE; i += get_local_size(0))
w[i]
=
(
real2
)
(
cos
(
-
sign
*i*2*M_PI/ZSIZE
)
,
sin
(
-
sign
*i*2*M_PI/ZSIZE
))
;
w[i]
=
(
real2
)
(
cos
(
-
(
SIGN
)
*i*2*
M_PI/ZSIZE
)
,
sin
(
-
(
SIGN
)
*i*2*
M_PI/ZSIZE
))
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
baseIndex
=
get_group_id
(
0
)
*BLOCKS_PER_GROUP
; baseIndex < XSIZE*YSIZE; baseIndex += get_num_groups(0)*BLOCKS_PER_GROUP) {
int
index
=
baseIndex+get_local_id
(
0
)
/ZSIZE
;
int
x
=
index/YSIZE
;
int
y
=
index-x*YSIZE
;
#
if
OUTPUT_IS_PACKED
if
(
x
<
XSIZE/2+1
)
{
#
endif
#
if
LOOP_REQUIRED
for
(
int
z
=
get_local_id
(
0
)
; z < ZSIZE; z += get_local_size(0))
#
if
INPUT_IS_REAL
data0[z]
=
(
real2
)
(
in[x*
(
YSIZE*ZSIZE
)
+y*ZSIZE+z],
0
)
;
#
elif
INPUT_IS_PACKED
data0[z]
=
loadComplexValue
(
in,
x,
y,
z
)
;
#
else
data0[z]
=
in[x*
(
YSIZE*ZSIZE
)
+y*ZSIZE+z]
;
#
endif
#
else
if
(
index
<
XSIZE*YSIZE
)
#
if
INPUT_IS_REAL
data0[get_local_id
(
0
)
]
=
(
real2
)
(
in[x*
(
YSIZE*ZSIZE
)
+y*ZSIZE+get_local_id
(
0
)
%ZSIZE],
0
)
;
#
elif
INPUT_IS_PACKED
data0[get_local_id
(
0
)
]
=
loadComplexValue
(
in,
x,
y,
get_local_id
(
0
)
%ZSIZE
)
;
#
else
data0[get_local_id
(
0
)
]
=
in[x*
(
YSIZE*ZSIZE
)
+y*ZSIZE+get_local_id
(
0
)
%ZSIZE]
;
#
endif
#
endif
#
if
OUTPUT_IS_PACKED
}
#
endif
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
COMPUTE_FFT
...
...
platforms/opencl/src/kernels/fftR2C.cl
0 → 100644
View file @
71d33617
/**
*
Combine
the
two
halves
of
a
real
grid
into
a
complex
grid
that
is
half
as
large.
*/
__kernel
void
packForwardData
(
__global
const
real*
restrict
in,
__global
real2*
restrict
out
)
{
const
int
gridSize
=
PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE
;
for
(
int
index
=
get_global_id
(
0
)
; index < gridSize; index += get_global_size(0)) {
int
x
=
index/
(
PACKED_YSIZE*PACKED_ZSIZE
)
;
int
remainder
=
index-x*
(
PACKED_YSIZE*PACKED_ZSIZE
)
;
int
y
=
remainder/PACKED_ZSIZE
;
int
z
=
remainder-y*PACKED_ZSIZE
;
#
if
PACKED_AXIS
==
0
real2
value
=
(
real2
)
(
in[2*x*YSIZE*ZSIZE+y*ZSIZE+z],
in[
(
2*x+1
)
*YSIZE*ZSIZE+y*
ZSIZE+z]
)
;
#
elif
PACKED_AXIS
==
1
real2
value
=
(
real2
)
(
in[x*YSIZE*ZSIZE+2*y*ZSIZE+z],
in[x*YSIZE*ZSIZE+
(
2*y+1
)
*ZSIZE+z]
)
;
#
else
real2
value
=
(
real2
)
(
in[x*YSIZE*ZSIZE+y*ZSIZE+2*z],
in[x*YSIZE*ZSIZE+y*ZSIZE+
(
2*z+1
)
]
)
;
#
endif
out[index]
=
value
;
}
}
/**
*
Split
the
transformed
data
back
into
a
full
sized,
symmetric
grid.
*/
__kernel
void
unpackForwardData
(
__global
const
real2*
restrict
in,
__global
real2*
restrict
out,
__local
real2*
restrict
w
)
{
//
Compute
the
phase
factors.
#
if
PACKED_AXIS
==
0
for
(
int
i
=
get_local_id
(
0
)
; i < PACKED_XSIZE; i += get_local_size(0))
w[i]
=
(
real2
)
(
sin
(
i*2*M_PI/XSIZE
)
,
cos
(
i*2*M_PI/XSIZE
))
;
#
elif
PACKED_AXIS
==
1
for
(
int
i
=
get_local_id
(
0
)
; i < PACKED_YSIZE; i += get_local_size(0))
w[i]
=
(
real2
)
(
sin
(
i*2*M_PI/YSIZE
)
,
cos
(
i*2*M_PI/YSIZE
))
;
#
else
for
(
int
i
=
get_local_id
(
0
)
; i < PACKED_ZSIZE; i += get_local_size(0))
w[i]
=
(
real2
)
(
sin
(
i*2*M_PI/ZSIZE
)
,
cos
(
i*2*M_PI/ZSIZE
))
;
#
endif
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Transform
the
data.
const
int
gridSize
=
PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE
;
const
int
outputZSize
=
ZSIZE/2+1
;
for
(
int
index
=
get_global_id
(
0
)
; index < gridSize; index += get_global_size(0)) {
int
x
=
index/
(
PACKED_YSIZE*PACKED_ZSIZE
)
;
int
remainder
=
index-x*
(
PACKED_YSIZE*PACKED_ZSIZE
)
;
int
y
=
remainder/PACKED_ZSIZE
;
int
z
=
remainder-y*PACKED_ZSIZE
;
int
xp
=
(
x
==
0
?
0
:
PACKED_XSIZE-x
)
;
int
yp
=
(
y
==
0
?
0
:
PACKED_YSIZE-y
)
;
int
zp
=
(
z
==
0
?
0
:
PACKED_ZSIZE-z
)
;
real2
z1
=
in[x*PACKED_YSIZE*PACKED_ZSIZE+y*PACKED_ZSIZE+z]
;
real2
z2
=
in[xp*PACKED_YSIZE*PACKED_ZSIZE+yp*PACKED_ZSIZE+zp]
;
#
if
PACKED_AXIS
==
0
real2
wfac
=
w[x]
;
#
elif
PACKED_AXIS
==
1
real2
wfac
=
w[y]
;
#
else
real2
wfac
=
w[z]
;
#
endif
real2
output
=
(
real2
)
((
z1.x+z2.x
-
wfac.x*
(
z1.x-z2.x
)
+
wfac.y*
(
z1.y+z2.y
))
/2,
(
z1.y-z2.y
-
wfac.y*
(
z1.x-z2.x
)
-
wfac.x*
(
z1.y+z2.y
))
/2
)
;
if
(
z
<
outputZSize
)
out[x*YSIZE*outputZSize+y*outputZSize+z]
=
output
;
xp
=
(
x
==
0
?
0
:
XSIZE-x
)
;
yp
=
(
y
==
0
?
0
:
YSIZE-y
)
;
zp
=
(
z
==
0
?
0
:
ZSIZE-z
)
;
if
(
zp
<
outputZSize
)
{
#
if
PACKED_AXIS
==
0
if
(
x
==
0
)
out[PACKED_XSIZE*YSIZE*outputZSize+yp*outputZSize+zp]
=
(
real2
)
((
z1.x-z1.y+z2.x-z2.y
)
/2,
(
-z1.x-z1.y+z2.x+z2.y
)
/2
)
;
#
elif
PACKED_AXIS
==
1
if
(
y
==
0
)
out[xp*YSIZE*outputZSize+PACKED_YSIZE*outputZSize+zp]
=
(
real2
)
((
z1.x-z1.y+z2.x-z2.y
)
/2,
(
-z1.x-z1.y+z2.x+z2.y
)
/2
)
;
#
else
if
(
z
==
0
)
out[xp*YSIZE*outputZSize+yp*outputZSize+PACKED_ZSIZE]
=
(
real2
)
((
z1.x-z1.y+z2.x-z2.y
)
/2,
(
-z1.x-z1.y+z2.x+z2.y
)
/2
)
;
#
endif
else
out[xp*YSIZE*outputZSize+yp*outputZSize+zp]
=
(
real2
)
(
output.x,
-output.y
)
;
}
}
}
/**
*
Load
a
value
from
the
half-complex
grid
produced
by
a
real-to-complex
transform.
*/
real2
loadComplexValue
(
__global
const
real2*
restrict
in,
int
x,
int
y,
int
z
)
{
const
int
inputZSize
=
ZSIZE/2+1
;
if
(
z
<
inputZSize
)
return
in[x*YSIZE*inputZSize+y*inputZSize+z]
;
int
xp
=
(
x
==
0
?
0
:
XSIZE-x
)
;
int
yp
=
(
y
==
0
?
0
:
YSIZE-y
)
;
real2
value
=
in[xp*YSIZE*inputZSize+yp*inputZSize+
(
ZSIZE-z
)
]
;
return
(
real2
)
(
value.x,
-value.y
)
;
}
/**
*
Repack
the
symmetric
complex
grid
into
one
half
as
large
in
preparation
for
doing
an
inverse
complex-to-real
transform.
*/
__kernel
void
packBackwardData
(
__global
const
real2*
restrict
in,
__global
real2*
restrict
out,
__local
real2*
restrict
w
)
{
//
Compute
the
phase
factors.
#
if
PACKED_AXIS
==
0
for
(
int
i
=
get_local_id
(
0
)
; i < PACKED_XSIZE; i += get_local_size(0))
w[i]
=
(
real2
)
(
cos
(
i*2*M_PI/XSIZE
)
,
sin
(
i*2*M_PI/XSIZE
))
;
#
elif
PACKED_AXIS
==
1
for
(
int
i
=
get_local_id
(
0
)
; i < PACKED_YSIZE; i += get_local_size(0))
w[i]
=
(
real2
)
(
cos
(
i*2*M_PI/YSIZE
)
,
sin
(
i*2*M_PI/YSIZE
))
;
#
else
for
(
int
i
=
get_local_id
(
0
)
; i < PACKED_ZSIZE; i += get_local_size(0))
w[i]
=
(
real2
)
(
cos
(
i*2*M_PI/ZSIZE
)
,
sin
(
i*2*M_PI/ZSIZE
))
;
#
endif
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Transform
the
data.
const
int
gridSize
=
PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE
;
for
(
int
index
=
get_global_id
(
0
)
; index < gridSize; index += get_global_size(0)) {
int
x
=
index/
(
PACKED_YSIZE*PACKED_ZSIZE
)
;
int
remainder
=
index-x*
(
PACKED_YSIZE*PACKED_ZSIZE
)
;
int
y
=
remainder/PACKED_ZSIZE
;
int
z
=
remainder-y*PACKED_ZSIZE
;
int
xp
=
(
x
==
0
?
0
:
PACKED_XSIZE-x
)
;
int
yp
=
(
y
==
0
?
0
:
PACKED_YSIZE-y
)
;
int
zp
=
(
z
==
0
?
0
:
PACKED_ZSIZE-z
)
;
real2
z1
=
loadComplexValue
(
in,
x,
y,
z
)
;
#
if
PACKED_AXIS
==
0
real2
wfac
=
w[x]
;
real2
z2
=
loadComplexValue
(
in,
PACKED_XSIZE-x,
yp,
zp
)
;
#
elif
PACKED_AXIS
==
1
real2
wfac
=
w[y]
;
real2
z2
=
loadComplexValue
(
in,
xp,
PACKED_YSIZE-y,
zp
)
;
#
else
real2
wfac
=
w[z]
;
real2
z2
=
loadComplexValue
(
in,
xp,
yp,
PACKED_ZSIZE-z
)
;
#
endif
real2
even
=
(
real2
)
((
z1.x+z2.x
)
/2,
(
z1.y-z2.y
)
/2
)
;
real2
odd
=
(
real2
)
((
z1.x-z2.x
)
/2,
(
z1.y+z2.y
)
/2
)
;
odd
=
(
real2
)
(
odd.x*wfac.x-odd.y*wfac.y,
odd.y*wfac.x+odd.x*wfac.y
)
;
out[x*PACKED_YSIZE*PACKED_ZSIZE+y*PACKED_ZSIZE+z]
=
(
real2
)
(
even.x-odd.y,
even.y+odd.x
)
;
}
}
/**
*
Split
the
data
back
into
a
full
sized,
real
grid
after
an
inverse
transform.
*/
__kernel
void
unpackBackwardData
(
__global
const
real2*
restrict
in,
__global
real*
restrict
out
)
{
const
int
gridSize
=
PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE
;
for
(
int
index
=
get_global_id
(
0
)
; index < gridSize; index += get_global_size(0)) {
int
x
=
index/
(
PACKED_YSIZE*PACKED_ZSIZE
)
;
int
remainder
=
index-x*
(
PACKED_YSIZE*PACKED_ZSIZE
)
;
int
y
=
remainder/PACKED_ZSIZE
;
int
z
=
remainder-y*PACKED_ZSIZE
;
real2
value
=
2*in[index]
;
#
if
PACKED_AXIS
==
0
out[2*x*YSIZE*ZSIZE+y*ZSIZE+z]
=
value.x
;
out[
(
2*x+1
)
*YSIZE*ZSIZE+y*
ZSIZE+z]
=
value.y
;
#
elif
PACKED_AXIS
==
1
out[x*YSIZE*ZSIZE+2*y*ZSIZE+z]
=
value.x
;
out[x*YSIZE*ZSIZE+
(
2*y+1
)
*ZSIZE+z]
=
value.y
;
#
else
out[x*YSIZE*ZSIZE+y*ZSIZE+2*z]
=
value.x
;
out[x*YSIZE*ZSIZE+y*ZSIZE+
(
2*z+1
)
]
=
value.y
;
#
endif
}
}
platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
View file @
71d33617
...
...
@@ -5,16 +5,15 @@
/**
*
Find
a
bounding
box
for
the
atoms
in
each
block.
*/
__kernel
void
findBlockBounds
(
int
numAtoms,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
__global
const
real4*
restrict
posq
,
__global
real4*
restrict
blockCenter,
__global
real4*
restrict
blockBoundingBox,
__global
int*
restrict
rebuildNeighborList,
__kernel
void
findBlockBounds
(
int
numAtoms,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
real4
periodicBoxVecX,
real4
periodicBoxVecY,
real4
periodicBoxVecZ
,
__global
const
real4*
restrict
posq,
__global
real4*
restrict
blockCenter,
__global
real4*
restrict
blockBoundingBox,
__global
int*
restrict
rebuildNeighborList,
__global
real2*
restrict
sortedBlocks
)
{
int
index
=
get_global_id
(
0
)
;
int
base
=
index*TILE_SIZE
;
while
(
base
<
numAtoms
)
{
real4
pos
=
posq[base]
;
#
ifdef
USE_PERIODIC
pos.xyz
-=
floor
(
pos.xyz*invPeriodicBoxSize.xyz
)
*periodicBoxSize.xyz
;
real4
firstPoint
=
pos
;
APPLY_PERIODIC_TO_POS
(
pos
)
#
endif
real4
minPos
=
pos
;
real4
maxPos
=
pos
;
...
...
@@ -22,7 +21,8 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri
for
(
int
i
=
base+1
; i < last; i++) {
pos
=
posq[i]
;
#
ifdef
USE_PERIODIC
pos.xyz
-=
floor
((
pos.xyz-firstPoint.xyz
)
*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
real4
center
=
0.5f*
(
maxPos+minPos
)
;
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
pos,
center
)
#
endif
minPos
=
min
(
minPos,
pos
)
;
maxPos
=
max
(
maxPos,
pos
)
;
...
...
@@ -44,7 +44,7 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri
__kernel
void
sortBoxData
(
__global
const
real2*
restrict
sortedBlock,
__global
const
real4*
restrict
blockCenter,
__global
const
real4*
restrict
blockBoundingBox,
__global
real4*
restrict
sortedBlockCenter,
__global
real4*
restrict
sortedBlockBoundingBox,
__global
const
real4*
restrict
posq,
__global
const
real4*
restrict
oldPositions,
__global
unsigned
int*
restrict
interactionCount,
__global
int*
restrict
rebuildNeighborList
)
{
__global
unsigned
int*
restrict
interactionCount,
__global
int*
restrict
rebuildNeighborList
,
int
forceRebuild
)
{
for
(
int
i
=
get_global_id
(
0
)
; i < NUM_BLOCKS; i += get_global_size(0)) {
int
index
=
(
int
)
sortedBlock[i].y
;
sortedBlockCenter[i]
=
blockCenter[index]
;
...
...
@@ -53,7 +53,7 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c
//
Also
check
whether
any
atom
has
moved
enough
so
that
we
really
need
to
rebuild
the
neighbor
list.
bool
rebuild
=
f
alse
;
bool
rebuild
=
f
orceRebuild
;
for
(
int
i
=
get_global_id
(
0
)
; i < NUM_ATOMS; i += get_global_size(0)) {
real4
delta
=
oldPositions[i]-posq[i]
;
if
(
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
>
0.25f*PADDING*PADDING
)
...
...
@@ -70,8 +70,8 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c
*
to
global
memory.
*/
void
storeInteractionData
(
unsigned
short
x,
unsigned
short*
buffer,
int*
atoms,
int*
numAtoms,
int
numValid,
__global
unsigned
int*
interactionCount,
__global
int*
interactingTiles,
__global
unsigned
int*
interactingAtoms,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
__global
real4*
posq,
real4
blockCenterX,
real4
blockSizeX,
unsigned
int
maxTiles,
bool
finish
)
{
__global
int*
interactingTiles,
__global
unsigned
int*
interactingAtoms,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
real4
periodicBoxVecX,
real4
periodicBoxVecY,
real4
periodicBoxVecZ,
__global
const
real4*
posq,
real4
blockCenterX,
real4
blockSizeX,
unsigned
int
maxTiles,
bool
finish
)
{
real4
posBuffer[TILE_SIZE]
;
const
bool
singlePeriodicCopy
=
(
0.5f*periodicBoxSize.x-blockSizeX.x
>=
PADDED_CUTOFF
&&
0.5f*periodicBoxSize.y-blockSizeX.y
>=
PADDED_CUTOFF
&&
...
...
@@ -83,7 +83,7 @@ void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms,
//
The
box
is
small
enough
that
we
can
just
translate
all
the
atoms
into
a
single
periodic
//
box,
then
skip
having
to
apply
periodic
boundary
conditions
later.
pos.xyz
-=
floor
((
pos.xyz-blockCenterX.xyz
)
*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
pos,
blockCenterX
)
}
#
endif
posBuffer[i]
=
pos
;
...
...
@@ -99,14 +99,14 @@ void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms,
real4
pos
=
posq[atom]
;
#
ifdef
USE_PERIODIC
if
(
singlePeriodicCopy
)
pos.xyz
-=
floor
((
pos.xyz-blockCenterX.xyz
)
*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
pos,
blockCenterX
)
#
endif
bool
interacts
=
false
;
#
ifdef
USE_PERIODIC
if
(
!singlePeriodicCopy
)
{
for
(
int
j
=
0
; j < TILE_SIZE && !interacts; j++) {
real4
delta
=
pos-posBuffer[j]
;
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
APPLY_PERIODIC_TO_DELTA
(
delta
)
interacts
=
(
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
<
PADDED_CUTOFF_SQUARED
)
;
}
}
...
...
@@ -159,9 +159,10 @@ void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms,
*
Compare
the
bounding
boxes
for
each
pair
of
blocks.
If
they
are
sufficiently
far
apart,
*
mark
them
as
non-interacting.
*/
__kernel
void
findBlocksWithInteractions
(
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
__global
unsigned
int*
restrict
interactionCount,
__global
int*
restrict
interactingTiles,
__global
unsigned
int*
restrict
interactingAtoms,
__global
const
real4*
restrict
posq,
unsigned
int
maxTiles,
unsigned
int
startBlockIndex,
unsigned
int
numBlocks,
__global
real2*
restrict
sortedBlocks,
__global
const
real4*
restrict
sortedBlockCenter,
__global
const
real4*
restrict
sortedBlockBoundingBox,
__kernel
void
findBlocksWithInteractions
(
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
real4
periodicBoxVecX,
real4
periodicBoxVecY,
real4
periodicBoxVecZ,
__global
unsigned
int*
restrict
interactionCount,
__global
int*
restrict
interactingTiles,
__global
unsigned
int*
restrict
interactingAtoms,
__global
const
real4*
restrict
posq,
unsigned
int
maxTiles,
unsigned
int
startBlockIndex,
unsigned
int
numBlocks,
__global
real2*
restrict
sortedBlocks,
__global
const
real4*
restrict
sortedBlockCenter,
__global
const
real4*
restrict
sortedBlockBoundingBox,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__global
real4*
restrict
oldPositions,
__global
const
int*
restrict
rebuildNeighborList
)
{
if
(
rebuildNeighborList[0]
==
0
)
...
...
@@ -204,9 +205,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
real4
blockSizeY
=
sortedBlockBoundingBox[j]
;
real4
delta
=
blockCenterX-blockCenterY
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
APPLY_PERIODIC_TO_DELTA
(
delta
)
#
endif
delta.x
=
max
((
real
)
0
,
fabs
(
delta.x
)
-blockSizeX.x-blockSizeY.x
)
;
delta.y
=
max
((
real
)
0
,
fabs
(
delta.y
)
-blockSizeX.y-blockSizeY.y
)
;
...
...
@@ -216,12 +215,12 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
buffer[valuesInBuffer++]
=
y
;
if
(
valuesInBuffer
==
BUFFER_SIZE
)
{
storeInteractionData
(
x,
buffer,
atoms,
&numAtoms,
valuesInBuffer,
interactionCount,
interactingTiles,
interactingAtoms,
periodicBoxSize,
invPeriodicBoxSize,
posq,
blockCenterX,
blockSizeX,
maxTiles,
false
)
;
storeInteractionData
(
x,
buffer,
atoms,
&numAtoms,
valuesInBuffer,
interactionCount,
interactingTiles,
interactingAtoms,
periodicBoxSize,
invPeriodicBoxSize,
periodicBoxVecX,
periodicBoxVecY,
periodicBoxVecZ,
posq,
blockCenterX,
blockSizeX,
maxTiles,
false
)
;
valuesInBuffer
=
0
;
}
}
}
storeInteractionData
(
x,
buffer,
atoms,
&numAtoms,
valuesInBuffer,
interactionCount,
interactingTiles,
interactingAtoms,
periodicBoxSize,
invPeriodicBoxSize,
posq,
blockCenterX,
blockSizeX,
maxTiles,
true
)
;
storeInteractionData
(
x,
buffer,
atoms,
&numAtoms,
valuesInBuffer,
interactionCount,
interactingTiles,
interactingAtoms,
periodicBoxSize,
invPeriodicBoxSize,
periodicBoxVecX,
periodicBoxVecY,
periodicBoxVecZ,
posq,
blockCenterX,
blockSizeX,
maxTiles,
true
)
;
}
//
Record
the
positions
the
neighbor
list
is
based
on.
...
...
platforms/opencl/src/kernels/gbsaObc_cpu.cl
View file @
71d33617
...
...
@@ -20,8 +20,9 @@ __kernel void computeBornSum(
#
endif
__global
const
real4*
restrict
posq,
__global
const
float2*
restrict
global_params,
#
ifdef
USE_CUTOFF
__global
const
int*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
real4*
restrict
blockCenter,
__global
const
real4*
restrict
blockSize,
__global
const
int*
restrict
interactingAtoms,
__global
const
int*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
real4
periodicBoxVecX,
real4
periodicBoxVecY,
real4
periodicBoxVecZ,
unsigned
int
maxTiles,
__global
const
real4*
restrict
blockCenter,
__global
const
real4*
restrict
blockSize,
__global
const
int*
restrict
interactingAtoms,
#
else
unsigned
int
numTiles,
#
endif
...
...
@@ -62,7 +63,7 @@ __kernel void computeBornSum(
real4
posq2
=
(
real4
)
(
localData[j].x,
localData[j].y,
localData[j].z,
localData[j].q
)
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
#
ifdef
USE_PERIODIC
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
APPLY_PERIODIC_TO_DELTA
(
delta
)
#
endif
real
r2
=
dot
(
delta.xyz,
delta.xyz
)
;
#
ifdef
USE_CUTOFF
...
...
@@ -111,7 +112,7 @@ __kernel void computeBornSum(
real4
posq2
=
(
real4
)
(
localData[j].x,
localData[j].y,
localData[j].z,
localData[j].q
)
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
#
ifdef
USE_PERIODIC
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
APPLY_PERIODIC_TO_DELTA
(
delta
)
#
endif
real
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
#
ifdef
USE_CUTOFF
...
...
@@ -253,14 +254,13 @@ __kernel void computeBornSum(
real4 blockCenterX = blockCenter[x];
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
localData[tgx].x -= floor((localData[tgx].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[tgx].y -= floor((localData[tgx].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
localData[tgx].z -= floor((localData[tgx].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[tgx], blockCenterX)
}
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
real bornSum = 0;
real4 posq1 = posq[atom1];
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
float2 params1 = global_params[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
...
...
@@ -321,7 +321,7 @@ __kernel void computeBornSum(
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
int atom2 = atomIndices[j];
...
...
@@ -411,8 +411,9 @@ __kernel void computeGBSAForce1(
#endif
__global real* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii,
#ifdef USE_CUTOFF
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
unsigned int maxTiles, __global const real4* restrict blockCenter, __global const real4* restrict blockSize, __global const int* restrict interactingAtoms,
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
__global const real4* restrict blockSize, __global const int* restrict interactingAtoms,
#else
unsigned int numTiles,
#endif
...
...
@@ -452,7 +453,7 @@ __kernel void computeGBSAForce1(
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
...
...
@@ -516,7 +517,7 @@ __kernel void computeGBSAForce1(
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
...
...
@@ -669,15 +670,13 @@ __kernel void computeGBSAForce1(
real4
blockCenterX
=
blockCenter[x]
;
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
localData[tgx].x
-=
floor
((
localData[tgx].x-blockCenterX.x
)
*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
localData[tgx].y
-=
floor
((
localData[tgx].y-blockCenterX.y
)
*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
localData[tgx].z
-=
floor
((
localData[tgx].z-blockCenterX.z
)
*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
localData[tgx],
blockCenterX
)
}
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
real4
force
=
0
;
real4
posq1
=
posq[atom1]
;
posq1.xyz
-=
floor
(
(
posq1
.xyz-
blockCenterX
.xyz
)
*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
posq1
,
blockCenterX
)
float
bornRadius1
=
global_bornRadii[atom1]
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
real4
posq2
=
(
real4
)
(
localData[j].x,
localData[j].y,
localData[j].z,
localData[j].q
)
;
...
...
@@ -740,7 +739,7 @@ __kernel void computeGBSAForce1(
real4
posq2
=
(
real4
)
(
localData[j].x,
localData[j].y,
localData[j].z,
localData[j].q
)
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
#
ifdef
USE_PERIODIC
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
APPLY_PERIODIC_TO_DELTA
(
delta
)
#
endif
real
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
int
atom2
=
atomIndices[j]
;
...
...
platforms/opencl/src/kernels/nonbonded_cpu.cl
View file @
71d33617
...
...
@@ -23,7 +23,8 @@ __kernel void computeNonbonded(
__global
const
ushort2*
restrict
exclusionTiles,
unsigned
int
startTileIndex,
unsigned
int
numTileIndices
#
ifdef
USE_CUTOFF
,
__global
const
int*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
real4*
restrict
blockCenter,
__global
const
real4*
restrict
blockSize,
__global
const
int*
restrict
interactingAtoms
real4
periodicBoxVecX,
real4
periodicBoxVecY,
real4
periodicBoxVecZ,
unsigned
int
maxTiles,
__global
const
real4*
restrict
blockCenter,
__global
const
real4*
restrict
blockSize,
__global
const
int*
restrict
interactingAtoms
#
endif
PARAMETER_ARGUMENTS
)
{
real
energy
=
0
;
...
...
@@ -65,7 +66,7 @@ __kernel void computeNonbonded(
real4
posq2
=
(
real4
)
(
localData[j].x,
localData[j].y,
localData[j].z,
localData[j].q
)
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
#
ifdef
USE_PERIODIC
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
APPLY_PERIODIC_TO_DELTA
(
delta
)
#
endif
real
r2
=
dot
(
delta.xyz,
delta.xyz
)
;
#
ifdef
USE_CUTOFF
...
...
@@ -133,7 +134,7 @@ __kernel void computeNonbonded(
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF
...
...
@@ -291,15 +292,13 @@ __kernel void computeNonbonded(
real4 blockCenterX = blockCenter[x];
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
localData[tgx].x -= floor((localData[tgx].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[tgx].y -= floor((localData[tgx].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
localData[tgx].z -= floor((localData[tgx].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[tgx], blockCenterX)
}
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
real4 force = 0;
real4 posq1 = posq[atom1];
posq1.xyz -= floor(
(posq1
.xyz-
blockCenterX
.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
APPLY_PERIODIC_TO_POS_WITH_CENTER
(posq1
,
blockCenterX
)
LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
...
...
@@ -364,7 +363,7 @@ __kernel void computeNonbonded(
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF
...
...
platforms/opencl/src/kernels/pme.cl
View file @
71d33617
...
...
@@ -138,35 +138,34 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
)
;
int
index
=
xindex*GRID_SIZE_Y*GRID_SIZE_Z
+
yindex*GRID_SIZE_Z
+
zindex
;
real
add
=
pos.w*data[ix].x*data[iy].y*data[iz].z
;
#
ifdef
USE_DOUBLE_PRECISION
atom_add
(
&pmeGrid[2*index],
(
long
)
(
add*0x100000000
))
;
#
ifdef
USE_ALTERNATE_MEMORY_ACCESS_PATTERN
//
On
Nvidia
devices
(
at
least
Maxwell
anyway
)
,
this
split
ordering
produces
much
higher
performance.
Why?
//
I
have
no
idea!
And
of
course
on
AMD
it
produces
slower
performance.
GPUs
are
not
meant
to
be
understood.
atom_add
(
&pmeGrid[index%2
==
0
?
index/2
:
(
index+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z
)
/2],
(
long
)
(
add*0x100000000
))
;
#
else
atom_add
(
&pmeGrid[index],
(
long
)
(
add*0x100000000
))
;
#
endif
}
}
}
}
}
__kernel
void
finishSpreadCharge
(
__global
long*
restrict
pmeGrid
)
{
__global
real2*
realGrid
=
(
__global
real2*
)
pmeGrid
;
__kernel
void
finishSpreadCharge
(
__global
long*
restrict
fixedGrid,
__global
real*
restrict
realGrid
)
{
const
unsigned
int
gridSize
=
GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z
;
real
scale
=
EPSILON_FACTOR/
(
real
)
0x100000000
;
for
(
int
index
=
get_global_id
(
0
)
; index < gridSize; index += get_global_size(0)) {
#
ifdef
USE_
DOUBLE_PRECISIO
N
long
value
=
pmeGrid[2*index
]
;
#
ifdef
USE_
ALTERNATE_MEMORY_ACCESS_PATTER
N
long
value
=
fixedGrid[index%2
==
0
?
index/2
:
(
index+gridSize
)
/2
]
;
#
else
long
value
=
pme
Grid[index]
;
long
value
=
fixed
Grid[index]
;
#
endif
real2
realValue
=
(
real2
)
((
real
)
(
value*scale
)
,
0
)
;
realGrid[index]
=
realValue
;
realGrid[index]
=
(
real
)
(
value*scale
)
;
}
}
#
elif
defined
(
DEVICE_IS_CPU
)
__kernel
void
gridSpreadCharge
(
__global
const
real4*
restrict
posq,
__global
const
int2*
restrict
pmeAtomGridIndex,
__global
const
int*
restrict
pmeAtomRange,
__global
real
2
*
restrict
pmeGrid,
__global
const
real4*
restrict
pmeBsplineTheta,
real4
periodicBoxSize,
real4
recipBoxVecX,
real4
recipBoxVecY,
real4
recipBoxVecZ
)
{
__global
real*
restrict
pmeGrid,
__global
const
real4*
restrict
pmeBsplineTheta,
real4
periodicBoxSize,
real4
recipBoxVecX,
real4
recipBoxVecY,
real4
recipBoxVecZ
)
{
const
int
firstx
=
get_global_id
(
0
)
*GRID_SIZE_X/get_global_size
(
0
)
;
const
int
lastx
=
(
get_global_id
(
0
)
+1
)
*GRID_SIZE_X/get_global_size
(
0
)
;
if
(
firstx
==
lastx
)
...
...
@@ -230,7 +229,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
pmeGrid[index]
.x
+=
EPSILON_FACTOR*pos.w*data[ix].x*data[iy].y*data[iz].z
;
pmeGrid[index] += EPSILON_FACTOR*pos.w*data[ix].x*data[iy].y*data[iz].z;
}
}
}
...
...
@@ -238,7 +237,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
}
#else
__kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
__global
real
2
*
restrict
pmeGrid,
__global
const
real4*
restrict
pmeBsplineTheta
)
{
__global real* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta) {
unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
for (int gridIndex = get_global_id(0); gridIndex < numGridPoints; gridIndex += get_global_size(0)) {
// Compute the charge on a grid point.
...
...
@@ -290,22 +289,23 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
}
}
}
pmeGrid[gridIndex]
=
(
real2
)
(
result*EPSILON_FACTOR
,
0
)
;
pmeGrid[gridIndex] = result*EPSILON_FACTOR;
}
}
#endif
__kernel
void
reciprocalConvolution
(
__global
real2*
restrict
pmeGrid,
__global
real*
restrict
energyBuffer,
__global
const
real*
restrict
pmeBsplineModuliX,
__global
const
real*
restrict
pmeBsplineModuliY,
__global
const
real*
restrict
pmeBsplineModuliZ,
real4
recipBoxVecX,
real4
recipBoxVecY,
real4
recipBoxVecZ,
real
recipScaleFactor
)
{
const
unsigned
int
gridSize
=
GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z
;
real
energy
=
0.0f
;
__kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global const real* restrict pmeBsplineModuliX,
__global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
// R2C stores into a half complex matrix where the last dimension is cut by half
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
const real recipScaleFactor = (1.0f/M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
int
kx
=
index/
(
GRID_SIZE_Y*GRID_SIZE_Z
)
;
int
remainder
=
index-kx*GRID_SIZE_Y*GRID_SIZE_Z
;
int
ky
=
remainder/GRID_SIZE_Z
;
int
kz
=
remainder-ky*GRID_SIZE_Z
;
if
(
kx
==
0
&&
ky
==
0
&&
kz
==
0
)
continue
;
// real indices
int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
int ky = remainder/(GRID_SIZE_Z/2+1);
int kz = remainder-ky*(GRID_SIZE_Z/2+1);
int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
...
...
@@ -319,13 +319,53 @@ __kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global r
real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
real denom = m2*bx*by*bz;
real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
pmeGrid[index]
=
(
real2
)
(
grid.x*eterm,
grid.y*eterm
)
;
energy
+=
eterm*
(
grid.x*grid.x
+
grid.y*grid.y
)
;
if (kx != 0 || ky != 0 || kz != 0) {
pmeGrid[index] = (real2) (grid.x*eterm, grid.y*eterm);
}
}
}
__kernel void gridEvaluateEnergy(__global real2* restrict pmeGrid, __global real* restrict energyBuffer,
__global const real* restrict pmeBsplineModuliX, __global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ,
real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
// R2C stores into a half complex matrix where the last dimension is cut by half
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
const real recipScaleFactor = (1.0f/M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
real energy = 0;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
// real indices
int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z));
int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z);
int ky = remainder/(GRID_SIZE_Z);
int kz = remainder-ky*(GRID_SIZE_Z);
int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
real mhx = mx*recipBoxVecX.x;
real mhy = mx*recipBoxVecY.x+my*recipBoxVecY.y;
real mhz = mx*recipBoxVecZ.x+my*recipBoxVecZ.y+mz*recipBoxVecZ.z;
real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
real bx = pmeBsplineModuliX[kx];
real by = pmeBsplineModuliY[ky];
real bz = pmeBsplineModuliZ[kz];
real denom = m2*bx*by*bz;
real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
if (kz >= (GRID_SIZE_Z/2+1)) {
kx = ((kx == 0) ? kx : GRID_SIZE_X-kx);
ky = ((ky == 0) ? ky : GRID_SIZE_Y-ky);
kz = GRID_SIZE_Z-kz;
}
int indexInHalfComplexGrid = kz + ky*(GRID_SIZE_Z/2+1)+kx*(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
real2 grid = pmeGrid[indexInHalfComplexGrid];
if (kx != 0 || ky != 0 |
|
kz
!=
0
)
{
energy
+=
eterm*
(
grid.x*grid.x
+
grid.y*grid.y
)
;
}
}
energyBuffer[get_global_id
(
0
)
]
+=
0.5f*energy
;
}
__kernel
void
gridInterpolateForce
(
__global
const
real4*
restrict
posq,
__global
real4*
restrict
forceBuffers,
__global
const
real
2
*
restrict
pmeGrid,
__kernel
void
gridInterpolateForce
(
__global
const
real4*
restrict
posq,
__global
real4*
restrict
forceBuffers,
__global
const
real*
restrict
pmeGrid,
real4
periodicBoxSize,
real4
recipBoxVecX,
real4
recipBoxVecY,
real4
recipBoxVecZ,
__global
int2*
restrict
pmeAtomGridIndex
)
{
const
real4
scale
=
1/
(
real
)
(
PME_ORDER-1
)
;
real4
data[PME_ORDER]
;
...
...
@@ -385,7 +425,7 @@ __kernel void gridInterpolateForce(__global const real4* restrict posq, __global
int
zindex
=
gridIndex.z+iz
;
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
)
;
int
index
=
xindex*GRID_SIZE_Y*GRID_SIZE_Z
+
yindex*GRID_SIZE_Z
+
zindex
;
real
gridvalue
=
pmeGrid[index]
.x
;
real
gridvalue
=
pmeGrid[index]
;
force.x
+=
ddata[ix].x*data[iy].y*data[iz].z*gridvalue
;
force.y
+=
data[ix].x*ddata[iy].y*data[iz].z*gridvalue
;
force.z
+=
data[ix].x*data[iy].y*ddata[iz].z*gridvalue
;
...
...
platforms/opencl/src/kernels/sort.cl
View file @
71d33617
...
...
@@ -109,13 +109,7 @@ __kernel void assignElementsToBuckets(__global const DATA_TYPE* restrict data, u
float
maxValue
=
(
float
)
(
range[1]
)
;
float
bucketWidth
=
(
maxValue-minValue
)
/numBuckets
;
for
(
uint
index
=
get_global_id
(
0
)
; index < length; index += get_global_size(0)) {
#
if
defined
(
MAC_AMD_WORKAROUND
)
&&
VALUE_IS_INT2
__global
int*
d
=
(
__global
int*
)
data
;
int2
element
=
(
int2
)
(
d[2*index],
d[2*index+1]
)
;
float
key
=
(
float
)
getValue
(
element
)
;
#
else
float
key
=
(
float
)
getValue
(
data[index]
)
;
#
endif
uint
bucketIndex
=
min
((
uint
)
((
key-minValue
)
/bucketWidth
)
,
numBuckets-1
)
;
offsetInBucket[index]
=
atom_inc
(
&bucketOffset[bucketIndex]
)
;
bucketOfElement[index]
=
bucketIndex
;
...
...
platforms/opencl/tests/TestOpenCLCMAPTorsionForce.cpp
View file @
71d33617
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010 Stanford University and the Authors.
*
* Portions copyright (c) 2010
-2015
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -105,11 +105,68 @@ void testCMAPTorsions() {
}
}
void
testChangingParameters
()
{
// Create a system with two maps and one torsion.
const
int
mapSize
=
8
;
System
system
;
for
(
int
i
=
0
;
i
<
5
;
i
++
)
system
.
addParticle
(
1.0
);
CMAPTorsionForce
*
cmap
=
new
CMAPTorsionForce
();
vector
<
double
>
mapEnergy1
(
mapSize
*
mapSize
);
vector
<
double
>
mapEnergy2
(
mapSize
*
mapSize
);
for
(
int
i
=
0
;
i
<
mapSize
;
i
++
)
{
double
angle1
=
i
*
2
*
M_PI
/
mapSize
;
double
energy1
=
cos
(
angle1
);
for
(
int
j
=
0
;
j
<
mapSize
;
j
++
)
{
double
angle2
=
j
*
2
*
M_PI
/
mapSize
;
double
energy2
=
10
*
sin
(
angle2
);
mapEnergy1
[
i
+
j
*
mapSize
]
=
energy1
+
energy2
;
mapEnergy2
[
i
+
j
*
mapSize
]
=
energy1
-
energy2
;
}
}
cmap
->
addMap
(
mapSize
,
mapEnergy1
);
cmap
->
addMap
(
mapSize
,
mapEnergy2
);
cmap
->
addTorsion
(
0
,
0
,
1
,
2
,
3
,
1
,
2
,
3
,
4
);
system
.
addForce
(
cmap
);
// Set particle positions so angle1=0 and angle2=PI/4.
vector
<
Vec3
>
positions
(
5
);
positions
[
0
]
=
Vec3
(
0
,
0
,
1
);
positions
[
1
]
=
Vec3
(
0
,
0
,
0
);
positions
[
2
]
=
Vec3
(
1
,
0
,
0
);
positions
[
3
]
=
Vec3
(
1
,
0
,
1
);
positions
[
4
]
=
Vec3
(
0.5
,
-
0.5
,
1
);
VerletIntegrator
integrator
(
0.01
);
Context
context
(
system
,
integrator
,
platform
);
context
.
setPositions
(
positions
);
// Check that the energy is correct.
double
energy
=
context
.
getState
(
State
::
Energy
).
getPotentialEnergy
();
ASSERT_EQUAL_TOL
(
1
+
10
*
sin
(
M_PI
/
4
),
energy
,
1e-5
);
// Modify the parameters.
cmap
->
setTorsionParameters
(
0
,
1
,
0
,
1
,
2
,
3
,
1
,
2
,
3
,
4
);
for
(
int
i
=
0
;
i
<
mapSize
*
mapSize
;
i
++
)
mapEnergy2
[
i
]
*=
2.0
;
cmap
->
setMapParameters
(
1
,
mapSize
,
mapEnergy2
);
cmap
->
updateParametersInContext
(
context
);
// See if the results are correct.
energy
=
context
.
getState
(
State
::
Energy
).
getPotentialEnergy
();
ASSERT_EQUAL_TOL
(
2
-
20
*
sin
(
M_PI
/
4
),
energy
,
1e-5
);
}
int
main
(
int
argc
,
char
*
argv
[])
{
try
{
if
(
argc
>
1
)
platform
.
setPropertyDefaultValue
(
"OpenCLPrecision"
,
string
(
argv
[
1
]));
testCMAPTorsions
();
testChangingParameters
();
}
catch
(
const
exception
&
e
)
{
cout
<<
"exception: "
<<
e
.
what
()
<<
endl
;
...
...
platforms/opencl/tests/TestOpenCLFFT.cpp
View file @
71d33617
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011 Stanford University and the Authors.
*
* Portions copyright (c) 2011
-2015
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -51,7 +51,7 @@ using namespace std;
static
OpenCLPlatform
platform
;
template
<
class
Real2
>
void
testTransform
()
{
void
testTransform
(
bool
realToComplex
,
int
xsize
,
int
ysize
,
int
zsize
)
{
System
system
;
system
.
addParticle
(
0.0
);
OpenCLPlatform
::
PlatformData
platformData
(
system
,
""
,
""
,
platform
.
getPropertyDefaultValue
(
"OpenCLPrecision"
),
"false"
);
...
...
@@ -59,7 +59,6 @@ void testTransform() {
context
.
initialize
();
OpenMM_SFMT
::
SFMT
sfmt
;
init_gen_rand
(
0
,
sfmt
);
int
xsize
=
28
,
ysize
=
25
,
zsize
=
30
;
vector
<
Real2
>
original
(
xsize
*
ysize
*
zsize
);
vector
<
t_complex
>
reference
(
original
.
size
());
for
(
int
i
=
0
;
i
<
(
int
)
original
.
size
();
i
++
)
{
...
...
@@ -67,10 +66,16 @@ void testTransform() {
original
[
i
]
=
value
;
reference
[
i
]
=
t_complex
(
value
.
x
,
value
.
y
);
}
for
(
int
i
=
0
;
i
<
(
int
)
reference
.
size
();
i
++
)
{
if
(
realToComplex
)
reference
[
i
]
=
t_complex
(
i
%
2
==
0
?
original
[
i
/
2
].
x
:
original
[
i
/
2
].
y
,
0
);
else
reference
[
i
]
=
t_complex
(
original
[
i
].
x
,
original
[
i
].
y
);
}
OpenCLArray
grid1
(
context
,
original
.
size
(),
sizeof
(
Real2
),
"grid1"
);
OpenCLArray
grid2
(
context
,
original
.
size
(),
sizeof
(
Real2
),
"grid2"
);
grid1
.
upload
(
original
);
OpenCLFFT3D
fft
(
context
,
xsize
,
ysize
,
zsize
);
OpenCLFFT3D
fft
(
context
,
xsize
,
ysize
,
zsize
,
realToComplex
);
// Perform a forward FFT, then verify the result is correct.
...
...
@@ -80,10 +85,15 @@ void testTransform() {
fftpack_t
plan
;
fftpack_init_3d
(
&
plan
,
xsize
,
ysize
,
zsize
);
fftpack_exec_3d
(
plan
,
FFTPACK_FORWARD
,
&
reference
[
0
],
&
reference
[
0
]);
for
(
int
i
=
0
;
i
<
(
int
)
result
.
size
();
++
i
)
{
ASSERT_EQUAL_TOL
(
reference
[
i
].
re
,
result
[
i
].
x
,
1e-3
);
ASSERT_EQUAL_TOL
(
reference
[
i
].
im
,
result
[
i
].
y
,
1e-3
);
}
int
outputZSize
=
(
realToComplex
?
zsize
/
2
+
1
:
zsize
);
for
(
int
x
=
0
;
x
<
xsize
;
x
++
)
for
(
int
y
=
0
;
y
<
ysize
;
y
++
)
for
(
int
z
=
0
;
z
<
outputZSize
;
z
++
)
{
int
index1
=
x
*
ysize
*
zsize
+
y
*
zsize
+
z
;
int
index2
=
x
*
ysize
*
outputZSize
+
y
*
outputZSize
+
z
;
ASSERT_EQUAL_TOL
(
reference
[
index1
].
re
,
result
[
index2
].
x
,
1e-3
);
ASSERT_EQUAL_TOL
(
reference
[
index1
].
im
,
result
[
index2
].
y
,
1e-3
);
}
fftpack_destroy
(
plan
);
// Perform a backward transform and see if we get the original values.
...
...
@@ -91,7 +101,8 @@ void testTransform() {
fft
.
execFFT
(
grid2
,
grid1
,
false
);
grid1
.
download
(
result
);
double
scale
=
1.0
/
(
xsize
*
ysize
*
zsize
);
for
(
int
i
=
0
;
i
<
(
int
)
result
.
size
();
++
i
)
{
int
valuesToCheck
=
(
realToComplex
?
original
.
size
()
/
2
:
original
.
size
());
for
(
int
i
=
0
;
i
<
valuesToCheck
;
++
i
)
{
ASSERT_EQUAL_TOL
(
original
[
i
].
x
,
scale
*
result
[
i
].
x
,
1e-4
);
ASSERT_EQUAL_TOL
(
original
[
i
].
y
,
scale
*
result
[
i
].
y
,
1e-4
);
}
...
...
@@ -101,10 +112,20 @@ int main(int argc, char* argv[]) {
try
{
if
(
argc
>
1
)
platform
.
setPropertyDefaultValue
(
"OpenCLPrecision"
,
string
(
argv
[
1
]));
if
(
platform
.
getPropertyDefaultValue
(
"OpenCLPrecision"
)
==
"double"
)
testTransform
<
mm_double2
>
();
else
testTransform
<
mm_float2
>
();
if
(
platform
.
getPropertyDefaultValue
(
"OpenCLPrecision"
)
==
"double"
)
{
testTransform
<
mm_double2
>
(
false
,
28
,
25
,
30
);
testTransform
<
mm_double2
>
(
true
,
28
,
25
,
25
);
testTransform
<
mm_double2
>
(
true
,
25
,
28
,
25
);
testTransform
<
mm_double2
>
(
true
,
25
,
25
,
28
);
testTransform
<
mm_double2
>
(
true
,
21
,
25
,
27
);
}
else
{
testTransform
<
mm_float2
>
(
false
,
28
,
25
,
30
);
testTransform
<
mm_float2
>
(
true
,
28
,
25
,
25
);
testTransform
<
mm_float2
>
(
true
,
25
,
28
,
25
);
testTransform
<
mm_float2
>
(
true
,
25
,
25
,
28
);
testTransform
<
mm_float2
>
(
true
,
21
,
25
,
27
);
}
}
catch
(
const
exception
&
e
)
{
cout
<<
"exception: "
<<
e
.
what
()
<<
endl
;
...
...
Prev
1
2
3
4
5
6
7
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment