Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
59bccb15
Commit
59bccb15
authored
May 15, 2015
by
peastman
Browse files
Merge branch 'master' into pthreads
Conflicts: libraries/pthreads/include/pthread.h
parents
6cf75568
4bdbcf4d
Changes
79
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
823 additions
and
163 deletions
+823
-163
platforms/opencl/src/OpenCLContext.cpp
platforms/opencl/src/OpenCLContext.cpp
+4
-5
platforms/opencl/src/OpenCLExpressionUtilities.cpp
platforms/opencl/src/OpenCLExpressionUtilities.cpp
+3
-0
platforms/opencl/src/OpenCLFFT3D.cpp
platforms/opencl/src/OpenCLFFT3D.cpp
+136
-30
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+64
-45
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+1
-0
platforms/opencl/src/OpenCLPlatform.cpp
platforms/opencl/src/OpenCLPlatform.cpp
+5
-7
platforms/opencl/src/OpenCLSort.cpp
platforms/opencl/src/OpenCLSort.cpp
+6
-3
platforms/opencl/src/kernels/customIntegratorGlobal.cl
platforms/opencl/src/kernels/customIntegratorGlobal.cl
+1
-1
platforms/opencl/src/kernels/customIntegratorPerDof.cl
platforms/opencl/src/kernels/customIntegratorPerDof.cl
+1
-1
platforms/opencl/src/kernels/fft.cl
platforms/opencl/src/kernels/fft.cl
+33
-2
platforms/opencl/src/kernels/fftR2C.cl
platforms/opencl/src/kernels/fftR2C.cl
+166
-0
platforms/opencl/src/kernels/findInteractingBlocks.cl
platforms/opencl/src/kernels/findInteractingBlocks.cl
+283
-4
platforms/opencl/src/kernels/pme.cl
platforms/opencl/src/kernels/pme.cl
+68
-28
platforms/opencl/src/kernels/sort.cl
platforms/opencl/src/kernels/sort.cl
+0
-6
platforms/opencl/tests/TestOpenCLCustomIntegrator.cpp
platforms/opencl/tests/TestOpenCLCustomIntegrator.cpp
+1
-1
platforms/opencl/tests/TestOpenCLFFT.cpp
platforms/opencl/tests/TestOpenCLFFT.cpp
+34
-13
platforms/opencl/tests/TestOpenCLLocalEnergyMinimizer.cpp
platforms/opencl/tests/TestOpenCLLocalEnergyMinimizer.cpp
+8
-8
platforms/reference/src/SimTKReference/ObcParameters.cpp
platforms/reference/src/SimTKReference/ObcParameters.cpp
+3
-3
platforms/reference/tests/TestReferenceCustomIntegrator.cpp
platforms/reference/tests/TestReferenceCustomIntegrator.cpp
+1
-1
platforms/reference/tests/TestReferenceLocalEnergyMinimizer.cpp
...rms/reference/tests/TestReferenceLocalEnergyMinimizer.cpp
+5
-5
No files found.
platforms/opencl/src/OpenCLContext.cpp
View file @
59bccb15
...
...
@@ -106,9 +106,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
// if they supplied a valid deviceIndex, we only look through that one
if
(
i
!=
deviceIndex
&&
deviceIndex
>=
0
&&
deviceIndex
<
(
int
)
devices
.
size
())
continue
;
if
(
platformVendor
==
"Apple"
&&
(
devices
[
i
].
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_CPU
||
devices
[
i
].
getInfo
<
CL_DEVICE_VENDOR
>
()
==
"AMD"
))
continue
;
// The CPU device on OS X won't work correctly, and there are serious bugs using AMD GPUs.
if
(
platformVendor
==
"Apple"
&&
(
devices
[
i
].
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_CPU
))
continue
;
// The CPU device on OS X won't work correctly.
int
maxSize
=
devices
[
i
].
getInfo
<
CL_DEVICE_MAX_WORK_ITEM_SIZES
>
()[
0
];
int
processingElementsPerComputeUnit
=
8
;
if
(
devices
[
i
].
getInfo
<
CL_DEVICE_TYPE
>
()
!=
CL_DEVICE_TYPE_GPU
)
{
...
...
@@ -170,6 +169,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
compilationDefines
[
"WORK_GROUP_SIZE"
]
=
intToString
(
ThreadBlockSize
);
if
(
platformVendor
.
size
()
>=
5
&&
platformVendor
.
substr
(
0
,
5
)
==
"Intel"
)
defaultOptimizationOptions
=
""
;
else
if
(
platformVendor
==
"Apple"
)
defaultOptimizationOptions
=
"-cl-mad-enable -cl-no-signed-zeros"
;
else
defaultOptimizationOptions
=
"-cl-fast-relaxed-math"
;
supports64BitGlobalAtomics
=
(
device
.
getInfo
<
CL_DEVICE_EXTENSIONS
>
().
find
(
"cl_khr_int64_base_atomics"
)
!=
string
::
npos
);
...
...
@@ -241,8 +242,6 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
}
else
simdWidth
=
1
;
if
(
platformVendor
==
"Apple"
&&
vendor
==
"AMD"
)
compilationDefines
[
"MAC_AMD_WORKAROUND"
]
=
""
;
if
(
supports64BitGlobalAtomics
)
compilationDefines
[
"SUPPORTS_64_BIT_ATOMICS"
]
=
""
;
if
(
supportsDoublePrecision
)
...
...
platforms/opencl/src/OpenCLExpressionUtilities.cpp
View file @
59bccb15
...
...
@@ -455,6 +455,9 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre
case
Operation
::
CEIL
:
out
<<
"ceil("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
")"
;
break
;
case
Operation
::
SELECT
:
out
<<
"("
<<
getTempName
(
node
.
getChildren
()[
0
],
temps
)
<<
" != 0 ? "
<<
getTempName
(
node
.
getChildren
()[
1
],
temps
)
<<
" : "
<<
getTempName
(
node
.
getChildren
()[
2
],
temps
)
<<
")"
;
break
;
default:
throw
OpenMMException
(
"Internal error: Unknown operation in user-defined expression: "
+
node
.
getOperation
().
getName
());
}
...
...
platforms/opencl/src/OpenCLFFT3D.cpp
View file @
59bccb15
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-201
2
Stanford University and the Authors. *
* Portions copyright (c) 2009-201
5
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -35,25 +35,109 @@
using
namespace
OpenMM
;
using
namespace
std
;
OpenCLFFT3D
::
OpenCLFFT3D
(
OpenCLContext
&
context
,
int
xsize
,
int
ysize
,
int
zsize
)
:
context
(
context
),
xsize
(
xsize
),
ysize
(
ysize
),
zsize
(
zsize
)
{
zkernel
=
createKernel
(
xsize
,
ysize
,
zsize
,
zthreads
);
xkernel
=
createKernel
(
ysize
,
zsize
,
xsize
,
xthreads
);
ykernel
=
createKernel
(
zsize
,
xsize
,
ysize
,
ythreads
);
OpenCLFFT3D
::
OpenCLFFT3D
(
OpenCLContext
&
context
,
int
xsize
,
int
ysize
,
int
zsize
,
bool
realToComplex
)
:
context
(
context
),
xsize
(
xsize
),
ysize
(
ysize
),
zsize
(
zsize
)
{
packRealAsComplex
=
false
;
int
packedXSize
=
xsize
;
int
packedYSize
=
ysize
;
int
packedZSize
=
zsize
;
if
(
realToComplex
)
{
// If any axis size is even, we can pack the real values into a complex grid that is only half as large.
// Look for an appropriate axis.
packRealAsComplex
=
true
;
int
packedAxis
,
bufferSize
;
if
(
xsize
%
2
==
0
)
{
packedAxis
=
0
;
packedXSize
/=
2
;
bufferSize
=
packedXSize
;
}
else
if
(
ysize
%
2
==
0
)
{
packedAxis
=
1
;
packedYSize
/=
2
;
bufferSize
=
packedYSize
;
}
else
if
(
zsize
%
2
==
0
)
{
packedAxis
=
2
;
packedZSize
/=
2
;
bufferSize
=
packedZSize
;
}
else
packRealAsComplex
=
false
;
if
(
packRealAsComplex
)
{
// Build the kernels for packing and unpacking the data.
map
<
string
,
string
>
defines
;
defines
[
"XSIZE"
]
=
context
.
intToString
(
xsize
);
defines
[
"YSIZE"
]
=
context
.
intToString
(
ysize
);
defines
[
"ZSIZE"
]
=
context
.
intToString
(
zsize
);
defines
[
"PACKED_AXIS"
]
=
context
.
intToString
(
packedAxis
);
defines
[
"PACKED_XSIZE"
]
=
context
.
intToString
(
packedXSize
);
defines
[
"PACKED_YSIZE"
]
=
context
.
intToString
(
packedYSize
);
defines
[
"PACKED_ZSIZE"
]
=
context
.
intToString
(
packedZSize
);
defines
[
"M_PI"
]
=
context
.
doubleToString
(
M_PI
);
cl
::
Program
program
=
context
.
createProgram
(
OpenCLKernelSources
::
fftR2C
,
defines
);
packForwardKernel
=
cl
::
Kernel
(
program
,
"packForwardData"
);
unpackForwardKernel
=
cl
::
Kernel
(
program
,
"unpackForwardData"
);
unpackForwardKernel
.
setArg
(
2
,
bufferSize
*
(
context
.
getUseDoublePrecision
()
?
sizeof
(
mm_double2
)
:
sizeof
(
mm_float2
)),
NULL
);
packBackwardKernel
=
cl
::
Kernel
(
program
,
"packBackwardData"
);
packBackwardKernel
.
setArg
(
2
,
bufferSize
*
(
context
.
getUseDoublePrecision
()
?
sizeof
(
mm_double2
)
:
sizeof
(
mm_float2
)),
NULL
);
unpackBackwardKernel
=
cl
::
Kernel
(
program
,
"unpackBackwardData"
);
}
}
bool
inputIsReal
=
(
realToComplex
&&
!
packRealAsComplex
);
zkernel
=
createKernel
(
packedXSize
,
packedYSize
,
packedZSize
,
zthreads
,
0
,
true
,
inputIsReal
);
xkernel
=
createKernel
(
packedYSize
,
packedZSize
,
packedXSize
,
xthreads
,
1
,
true
,
inputIsReal
);
ykernel
=
createKernel
(
packedZSize
,
packedXSize
,
packedYSize
,
ythreads
,
2
,
true
,
inputIsReal
);
invzkernel
=
createKernel
(
packedXSize
,
packedYSize
,
packedZSize
,
zthreads
,
0
,
false
,
inputIsReal
);
invxkernel
=
createKernel
(
packedYSize
,
packedZSize
,
packedXSize
,
xthreads
,
1
,
false
,
inputIsReal
);
invykernel
=
createKernel
(
packedZSize
,
packedXSize
,
packedYSize
,
ythreads
,
2
,
false
,
inputIsReal
);
}
void
OpenCLFFT3D
::
execFFT
(
OpenCLArray
&
in
,
OpenCLArray
&
out
,
bool
forward
)
{
zkernel
.
setArg
<
cl
::
Buffer
>
(
0
,
in
.
getDeviceBuffer
());
zkernel
.
setArg
<
cl
::
Buffer
>
(
1
,
out
.
getDeviceBuffer
());
zkernel
.
setArg
<
cl_int
>
(
2
,
forward
?
1
:
-
1
);
context
.
executeKernel
(
zkernel
,
xsize
*
ysize
*
zsize
,
zthreads
);
xkernel
.
setArg
<
cl
::
Buffer
>
(
0
,
out
.
getDeviceBuffer
());
xkernel
.
setArg
<
cl
::
Buffer
>
(
1
,
in
.
getDeviceBuffer
());
xkernel
.
setArg
<
cl_int
>
(
2
,
forward
?
1
:
-
1
);
context
.
executeKernel
(
xkernel
,
xsize
*
ysize
*
zsize
,
xthreads
);
ykernel
.
setArg
<
cl
::
Buffer
>
(
0
,
in
.
getDeviceBuffer
());
ykernel
.
setArg
<
cl
::
Buffer
>
(
1
,
out
.
getDeviceBuffer
());
ykernel
.
setArg
<
cl_int
>
(
2
,
forward
?
1
:
-
1
);
context
.
executeKernel
(
ykernel
,
xsize
*
ysize
*
zsize
,
ythreads
);
cl
::
Kernel
kernel1
=
(
forward
?
zkernel
:
invzkernel
);
cl
::
Kernel
kernel2
=
(
forward
?
xkernel
:
invxkernel
);
cl
::
Kernel
kernel3
=
(
forward
?
ykernel
:
invykernel
);
if
(
packRealAsComplex
)
{
cl
::
Kernel
packKernel
=
(
forward
?
packForwardKernel
:
packBackwardKernel
);
cl
::
Kernel
unpackKernel
=
(
forward
?
unpackForwardKernel
:
unpackBackwardKernel
);
int
gridSize
=
xsize
*
ysize
*
zsize
/
2
;
// Pack the data into a half sized grid.
packKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
in
.
getDeviceBuffer
());
packKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
out
.
getDeviceBuffer
());
context
.
executeKernel
(
packKernel
,
gridSize
);
// Perform the FFT.
kernel1
.
setArg
<
cl
::
Buffer
>
(
0
,
out
.
getDeviceBuffer
());
kernel1
.
setArg
<
cl
::
Buffer
>
(
1
,
in
.
getDeviceBuffer
());
context
.
executeKernel
(
kernel1
,
gridSize
,
zthreads
);
kernel2
.
setArg
<
cl
::
Buffer
>
(
0
,
in
.
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
1
,
out
.
getDeviceBuffer
());
context
.
executeKernel
(
kernel2
,
gridSize
,
xthreads
);
kernel3
.
setArg
<
cl
::
Buffer
>
(
0
,
out
.
getDeviceBuffer
());
kernel3
.
setArg
<
cl
::
Buffer
>
(
1
,
in
.
getDeviceBuffer
());
context
.
executeKernel
(
kernel3
,
gridSize
,
ythreads
);
// Unpack the data.
unpackKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
in
.
getDeviceBuffer
());
unpackKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
out
.
getDeviceBuffer
());
context
.
executeKernel
(
unpackKernel
,
gridSize
);
}
else
{
kernel1
.
setArg
<
cl
::
Buffer
>
(
0
,
in
.
getDeviceBuffer
());
kernel1
.
setArg
<
cl
::
Buffer
>
(
1
,
out
.
getDeviceBuffer
());
context
.
executeKernel
(
kernel1
,
xsize
*
ysize
*
zsize
,
zthreads
);
kernel2
.
setArg
<
cl
::
Buffer
>
(
0
,
out
.
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
1
,
in
.
getDeviceBuffer
());
context
.
executeKernel
(
kernel2
,
xsize
*
ysize
*
zsize
,
xthreads
);
kernel3
.
setArg
<
cl
::
Buffer
>
(
0
,
in
.
getDeviceBuffer
());
kernel3
.
setArg
<
cl
::
Buffer
>
(
1
,
out
.
getDeviceBuffer
());
context
.
executeKernel
(
kernel3
,
xsize
*
ysize
*
zsize
,
ythreads
);
}
}
int
OpenCLFFT3D
::
findLegalDimension
(
int
minimum
)
{
...
...
@@ -73,8 +157,10 @@ int OpenCLFFT3D::findLegalDimension(int minimum) {
}
}
cl
::
Kernel
OpenCLFFT3D
::
createKernel
(
int
xsize
,
int
ysize
,
int
zsize
,
int
&
threads
)
{
cl
::
Kernel
OpenCLFFT3D
::
createKernel
(
int
xsize
,
int
ysize
,
int
zsize
,
int
&
threads
,
int
axis
,
bool
forward
,
bool
inputIsReal
)
{
int
maxThreads
=
std
::
min
(
256
,
(
int
)
context
.
getDevice
().
getInfo
<
CL_DEVICE_MAX_WORK_GROUP_SIZE
>
());
while
(
maxThreads
>
128
&&
maxThreads
-
64
>=
zsize
)
maxThreads
-=
64
;
bool
isCPU
=
context
.
getDevice
().
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_CPU
;
while
(
true
)
{
bool
loopRequired
=
(
zsize
>
maxThreads
||
isCPU
);
...
...
@@ -137,10 +223,10 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source
<<
"real2 b2 = "
<<
context
.
doubleToString
((
2
*
cos
(
2
*
M_PI
/
7
)
-
cos
(
4
*
M_PI
/
7
)
-
cos
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d0-d4);
\n
"
;
source
<<
"real2 b3 = "
<<
context
.
doubleToString
((
cos
(
2
*
M_PI
/
7
)
-
2
*
cos
(
4
*
M_PI
/
7
)
+
cos
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d4-d2);
\n
"
;
source
<<
"real2 b4 = "
<<
context
.
doubleToString
((
cos
(
2
*
M_PI
/
7
)
+
cos
(
4
*
M_PI
/
7
)
-
2
*
cos
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d2-d0);
\n
"
;
source
<<
"real2 b5 = -
sign
*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
+
sin
(
4
*
M_PI
/
7
)
-
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d7+d1);
\n
"
;
source
<<
"real2 b6 = -
sign
*"
<<
context
.
doubleToString
((
2
*
sin
(
2
*
M_PI
/
7
)
-
sin
(
4
*
M_PI
/
7
)
+
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d1-d5);
\n
"
;
source
<<
"real2 b7 = -
sign
*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
-
2
*
sin
(
4
*
M_PI
/
7
)
-
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d5-d3);
\n
"
;
source
<<
"real2 b8 = -
sign
*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
+
sin
(
4
*
M_PI
/
7
)
+
2
*
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d3-d1);
\n
"
;
source
<<
"real2 b5 = -
(SIGN)
*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
+
sin
(
4
*
M_PI
/
7
)
-
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d7+d1);
\n
"
;
source
<<
"real2 b6 = -
(SIGN)
*"
<<
context
.
doubleToString
((
2
*
sin
(
2
*
M_PI
/
7
)
-
sin
(
4
*
M_PI
/
7
)
+
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d1-d5);
\n
"
;
source
<<
"real2 b7 = -
(SIGN)
*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
-
2
*
sin
(
4
*
M_PI
/
7
)
-
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d5-d3);
\n
"
;
source
<<
"real2 b8 = -
(SIGN)
*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
+
sin
(
4
*
M_PI
/
7
)
+
2
*
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d3-d1);
\n
"
;
source
<<
"real2 t0 = b0+b1;
\n
"
;
source
<<
"real2 t1 = b2+b3;
\n
"
;
source
<<
"real2 t2 = b4-b3;
\n
"
;
...
...
@@ -178,8 +264,8 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source
<<
"real2 d7 = d6+d5;
\n
"
;
source
<<
"real2 d8 = d6-d5;
\n
"
;
string
coeff
=
context
.
doubleToString
(
sin
(
0.2
*
M_PI
)
/
sin
(
0.4
*
M_PI
));
source
<<
"real2 d9 =
sign
*(real2) (d2.y+"
<<
coeff
<<
"*d3.y, -d2.x-"
<<
coeff
<<
"*d3.x);
\n
"
;
source
<<
"real2 d10 =
sign
*(real2) ("
<<
coeff
<<
"*d2.y-d3.y, d3.x-"
<<
coeff
<<
"*d2.x);
\n
"
;
source
<<
"real2 d9 =
(SIGN)
*(real2) (d2.y+"
<<
coeff
<<
"*d3.y, -d2.x-"
<<
coeff
<<
"*d3.x);
\n
"
;
source
<<
"real2 d10 =
(SIGN)
*(real2) ("
<<
coeff
<<
"*d2.y-d3.y, d3.x-"
<<
coeff
<<
"*d2.x);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+4*j*"
<<
m
<<
"] = c0+d4;
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(4*j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
5
*
L
)
<<
"], d7+d9);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(4*j+2)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
2
*
zsize
)
<<
"/"
<<
(
5
*
L
)
<<
"], d8+d10);
\n
"
;
...
...
@@ -194,7 +280,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source
<<
"real2 d0 = c0+c2;
\n
"
;
source
<<
"real2 d1 = c0-c2;
\n
"
;
source
<<
"real2 d2 = c1+c3;
\n
"
;
source
<<
"real2 d3 =
sign
*(real2) (c1.y-c3.y, c3.x-c1.x);
\n
"
;
source
<<
"real2 d3 =
(SIGN)
*(real2) (c1.y-c3.y, c3.x-c1.x);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+3*j*"
<<
m
<<
"] = d0+d2;
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(3*j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
4
*
L
)
<<
"], d1+d3);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(3*j+2)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
2
*
zsize
)
<<
"/"
<<
(
4
*
L
)
<<
"], d0-d2);
\n
"
;
...
...
@@ -206,7 +292,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source
<<
"real2 c2 = data"
<<
input
<<
"[base+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 d0 = c1+c2;
\n
"
;
source
<<
"real2 d1 = c0-0.5f*d0;
\n
"
;
source
<<
"real2 d2 =
sign
*"
<<
context
.
doubleToString
(
sin
(
M_PI
/
3.0
))
<<
"*(real2) (c1.y-c2.y, c2.x-c1.x);
\n
"
;
source
<<
"real2 d2 =
(SIGN)
*"
<<
context
.
doubleToString
(
sin
(
M_PI
/
3.0
))
<<
"*(real2) (c1.y-c2.y, c2.x-c1.x);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+2*j*"
<<
m
<<
"] = c0+d0;
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(2*j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
3
*
L
)
<<
"], d1+d2);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(2*j+2)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
2
*
zsize
)
<<
"/"
<<
(
3
*
L
)
<<
"], d1-d2);
\n
"
;
...
...
@@ -226,13 +312,27 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
// Create the kernel.
bool
outputIsReal
=
(
inputIsReal
&&
axis
==
2
&&
!
forward
);
bool
outputIsPacked
=
(
inputIsReal
&&
axis
==
2
&&
forward
);
string
outputSuffix
=
(
outputIsReal
?
".x"
:
""
);
if
(
loopRequired
)
{
if
(
outputIsPacked
)
source
<<
"if (x < XSIZE/2+1)
\n
"
;
source
<<
"for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))
\n
"
;
source
<<
"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"
<<
(
stage
%
2
)
<<
"[z];
\n
"
;
if
(
outputIsPacked
)
source
<<
"out[y*(ZSIZE*(XSIZE/2+1))+z*(XSIZE/2+1)+x] = data"
<<
(
stage
%
2
)
<<
"[z]"
<<
outputSuffix
<<
";
\n
"
;
else
source
<<
"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"
<<
(
stage
%
2
)
<<
"[z]"
<<
outputSuffix
<<
";
\n
"
;
}
else
{
if
(
outputIsPacked
)
{
source
<<
"if (index < XSIZE*YSIZE && x < XSIZE/2+1)
\n
"
;
source
<<
"out[y*(ZSIZE*(XSIZE/2+1))+(get_local_id(0)%ZSIZE)*(XSIZE/2+1)+x] = data"
<<
(
stage
%
2
)
<<
"[get_local_id(0)]"
<<
outputSuffix
<<
";
\n
"
;
}
else
{
source
<<
"if (index < XSIZE*YSIZE)
\n
"
;
source
<<
"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"
<<
(
stage
%
2
)
<<
"[get_local_id(0)];
\n
"
;
source
<<
"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"
<<
(
stage
%
2
)
<<
"[get_local_id(0)]"
<<
outputSuffix
<<
";
\n
"
;
}
}
map
<
string
,
string
>
replacements
;
replacements
[
"XSIZE"
]
=
context
.
intToString
(
xsize
);
...
...
@@ -242,6 +342,12 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
replacements
[
"M_PI"
]
=
context
.
doubleToString
(
M_PI
);
replacements
[
"COMPUTE_FFT"
]
=
source
.
str
();
replacements
[
"LOOP_REQUIRED"
]
=
(
loopRequired
?
"1"
:
"0"
);
replacements
[
"SIGN"
]
=
(
forward
?
"1"
:
"-1"
);
replacements
[
"INPUT_TYPE"
]
=
(
inputIsReal
&&
axis
==
0
&&
forward
?
"real"
:
"real2"
);
replacements
[
"OUTPUT_TYPE"
]
=
(
outputIsReal
?
"real"
:
"real2"
);
replacements
[
"INPUT_IS_REAL"
]
=
(
inputIsReal
&&
axis
==
0
&&
forward
?
"1"
:
"0"
);
replacements
[
"INPUT_IS_PACKED"
]
=
(
inputIsReal
&&
axis
==
0
&&
!
forward
?
"1"
:
"0"
);
replacements
[
"OUTPUT_IS_PACKED"
]
=
(
outputIsPacked
?
"1"
:
"0"
);
cl
::
Program
program
=
context
.
createProgram
(
context
.
replaceStrings
(
OpenCLKernelSources
::
fft
,
replacements
));
cl
::
Kernel
kernel
(
program
,
"execFFT"
);
threads
=
(
isCPU
?
1
:
blocksPerGroup
*
zsize
);
...
...
@@ -253,9 +359,9 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
continue
;
}
int
bufferSize
=
blocksPerGroup
*
zsize
*
(
context
.
getUseDoublePrecision
()
?
sizeof
(
mm_double2
)
:
sizeof
(
mm_float2
));
kernel
.
setArg
(
2
,
bufferSize
,
NULL
);
kernel
.
setArg
(
3
,
bufferSize
,
NULL
);
kernel
.
setArg
(
4
,
bufferSize
,
NULL
);
kernel
.
setArg
(
5
,
bufferSize
,
NULL
);
return
kernel
;
}
}
platforms/opencl/src/OpenCLKernels.cpp
View file @
59bccb15
...
...
@@ -1628,6 +1628,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
pmeDefines
[
"GRID_SIZE_Y"
]
=
cl
.
intToString
(
gridSizeY
);
pmeDefines
[
"GRID_SIZE_Z"
]
=
cl
.
intToString
(
gridSizeZ
);
pmeDefines
[
"EPSILON_FACTOR"
]
=
cl
.
doubleToString
(
sqrt
(
ONE_4PI_EPS0
));
pmeDefines
[
"M_PI"
]
=
cl
.
doubleToString
(
M_PI
);
bool
deviceIsCpu
=
(
cl
.
getDevice
().
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_CPU
);
if
(
deviceIsCpu
)
pmeDefines
[
"DEVICE_IS_CPU"
]
=
"1"
;
...
...
@@ -1652,8 +1653,11 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
int
elementSize
=
(
cl
.
getUseDoublePrecision
()
?
sizeof
(
double
)
:
sizeof
(
float
));
pmeGrid
=
new
OpenCLArray
(
cl
,
gridSizeX
*
gridSizeY
*
gridSizeZ
,
2
*
elementSize
,
"pmeGrid"
);
cl.addAutoclearBuffer(*pmeGrid);
pmeGrid2
=
new
OpenCLArray
(
cl
,
gridSizeX
*
gridSizeY
*
gridSizeZ
,
2
*
elementSize
,
"pmeGrid2"
);
if
(
cl
.
getSupports64BitGlobalAtomics
())
cl
.
addAutoclearBuffer
(
*
pmeGrid2
);
else
cl
.
addAutoclearBuffer
(
*
pmeGrid
);
pmeBsplineModuliX
=
new
OpenCLArray
(
cl
,
gridSizeX
,
elementSize
,
"pmeBsplineModuliX"
);
pmeBsplineModuliY
=
new
OpenCLArray
(
cl
,
gridSizeY
,
elementSize
,
"pmeBsplineModuliY"
);
pmeBsplineModuliZ
=
new
OpenCLArray
(
cl
,
gridSizeZ
,
elementSize
,
"pmeBsplineModuliZ"
);
...
...
@@ -1661,9 +1665,12 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
pmeAtomRange
=
OpenCLArray
::
create
<
cl_int
>
(
cl
,
gridSizeX
*
gridSizeY
*
gridSizeZ
+
1
,
"pmeAtomRange"
);
pmeAtomGridIndex
=
OpenCLArray
::
create
<
mm_int2
>
(
cl
,
numParticles
,
"pmeAtomGridIndex"
);
sort
=
new
OpenCLSort
(
cl
,
new
SortTrait
(),
cl
.
getNumAtoms
());
fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ);
fft
=
new
OpenCLFFT3D
(
cl
,
gridSizeX
,
gridSizeY
,
gridSizeZ
,
true
);
string
vendor
=
cl
.
getDevice
().
getInfo
<
CL_DEVICE_VENDOR
>
();
usePmeQueue = (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA");
bool
isNvidia
=
(
vendor
.
size
()
>=
6
&&
vendor
.
substr
(
0
,
6
)
==
"NVIDIA"
);
if
(
isNvidia
)
pmeDefines
[
"USE_ALTERNATE_MEMORY_ACCESS_PATTERN"
]
=
"1"
;
usePmeQueue
=
isNvidia
;
if
(
usePmeQueue
)
{
pmeQueue
=
cl
::
CommandQueue
(
cl
.
getContext
(),
cl
.
getDevice
());
int
recipForceGroup
=
force
.
getReciprocalSpaceForceGroup
();
...
...
@@ -1800,6 +1807,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeZIndexKernel
=
cl
::
Kernel
(
program
,
"recordZIndex"
);
pmeSpreadChargeKernel
=
cl
::
Kernel
(
program
,
"gridSpreadCharge"
);
pmeConvolutionKernel
=
cl
::
Kernel
(
program
,
"reciprocalConvolution"
);
pmeEvalEnergyKernel
=
cl
::
Kernel
(
program
,
"gridEvaluateEnergy"
);
pmeInterpolateForceKernel
=
cl
::
Kernel
(
program
,
"gridInterpolateForce"
);
int
elementSize
=
(
cl
.
getUseDoublePrecision
()
?
sizeof
(
mm_double4
)
:
sizeof
(
mm_float4
));
pmeUpdateBsplinesKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
cl
.
getPosq
().
getDeviceBuffer
());
...
...
@@ -1814,20 +1822,28 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
cl
.
getPosq
().
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
pmeAtomGridIndex
->
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeAtomRange
->
getDeviceBuffer
());
if
(
cl
.
getSupports64BitGlobalAtomics
())
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
pmeGrid2
->
getDeviceBuffer
());
else
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
pmeGrid
->
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
pmeBsplineTheta
->
getDeviceBuffer
());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
pmeGrid2
->
getDeviceBuffer
());
pmeConvolutionKernel.setArg<cl::Buffer>(1, cl.getEnergyBuffer().getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(2, pmeBsplineModuliX->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(3, pmeBsplineModuliY->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(4, pmeBsplineModuliZ->getDeviceBuffer());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
pmeBsplineModuliX
->
getDeviceBuffer
());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeBsplineModuliY
->
getDeviceBuffer
());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
pmeBsplineModuliZ
->
getDeviceBuffer
());
pmeEvalEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
pmeGrid2
->
getDeviceBuffer
());
pmeEvalEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
cl
.
getEnergyBuffer
().
getDeviceBuffer
());
pmeEvalEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeBsplineModuliX
->
getDeviceBuffer
());
pmeEvalEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
pmeBsplineModuliY
->
getDeviceBuffer
());
pmeEvalEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
pmeBsplineModuliZ
->
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
cl
.
getPosq
().
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
cl
.
getForceBuffers
().
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeGrid
->
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
pmeAtomGridIndex
->
getDeviceBuffer
());
if
(
cl
.
getSupports64BitGlobalAtomics
())
{
pmeFinishSpreadChargeKernel
=
cl
::
Kernel
(
program
,
"finishSpreadCharge"
);
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid->getDeviceBuffer());
pmeFinishSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
pmeGrid2
->
getDeviceBuffer
());
pmeFinishSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
pmeGrid
->
getDeviceBuffer
());
}
}
}
...
...
@@ -1851,7 +1867,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl
.
executeKernel
(
ewaldForcesKernel
,
cl
.
getNumAtoms
());
}
if
(
pmeGrid
!=
NULL
&&
includeReciprocal
)
{
if (usePmeQueue)
if
(
usePmeQueue
&&
!
includeEnergy
)
cl
.
setQueue
(
pmeQueue
);
// Invert the periodic box vectors.
...
...
@@ -1926,19 +1942,24 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
}
fft
->
execFFT
(
*
pmeGrid
,
*
pmeGrid2
,
true
);
mm_double4
boxSize
=
cl
.
getPeriodicBoxSizeDouble
();
double scaleFactor = 1.0/(M_PI*boxSize.x*boxSize.y*boxSize.z);
if
(
cl
.
getUseDoublePrecision
())
{
pmeConvolutionKernel.setArg<mm_double4>(5, recipBoxVectors[0]);
pmeConvolutionKernel.setArg<mm_double4>(6, recipBoxVectors[1]);
pmeConvolutionKernel.setArg<mm_double4>(7, recipBoxVectors[2]);
pmeConvolutionKernel.setArg<cl_double>(8, scaleFactor);
pmeConvolutionKernel
.
setArg
<
mm_double4
>
(
4
,
recipBoxVectors
[
0
]);
pmeConvolutionKernel
.
setArg
<
mm_double4
>
(
5
,
recipBoxVectors
[
1
]);
pmeConvolutionKernel
.
setArg
<
mm_double4
>
(
6
,
recipBoxVectors
[
2
]);
pmeEvalEnergyKernel
.
setArg
<
mm_double4
>
(
5
,
recipBoxVectors
[
0
]);
pmeEvalEnergyKernel
.
setArg
<
mm_double4
>
(
6
,
recipBoxVectors
[
1
]);
pmeEvalEnergyKernel
.
setArg
<
mm_double4
>
(
7
,
recipBoxVectors
[
2
]);
}
else
{
pmeConvolutionKernel.setArg<mm_float4>(5, recipBoxVectorsFloat[0]);
pmeConvolutionKernel.setArg<mm_float4>(6, recipBoxVectorsFloat[1]);
pmeConvolutionKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[2]);
pmeConvolutionKernel.setArg<cl_float>(8, (float) scaleFactor);
}
pmeConvolutionKernel
.
setArg
<
mm_float4
>
(
4
,
recipBoxVectorsFloat
[
0
]);
pmeConvolutionKernel
.
setArg
<
mm_float4
>
(
5
,
recipBoxVectorsFloat
[
1
]);
pmeConvolutionKernel
.
setArg
<
mm_float4
>
(
6
,
recipBoxVectorsFloat
[
2
]);
pmeEvalEnergyKernel
.
setArg
<
mm_float4
>
(
5
,
recipBoxVectorsFloat
[
0
]);
pmeEvalEnergyKernel
.
setArg
<
mm_float4
>
(
6
,
recipBoxVectorsFloat
[
1
]);
pmeEvalEnergyKernel
.
setArg
<
mm_float4
>
(
7
,
recipBoxVectorsFloat
[
2
]);
}
if
(
includeEnergy
)
cl
.
executeKernel
(
pmeEvalEnergyKernel
,
cl
.
getNumAtoms
());
cl
.
executeKernel
(
pmeConvolutionKernel
,
cl
.
getNumAtoms
());
fft
->
execFFT
(
*
pmeGrid2
,
*
pmeGrid
,
false
);
setPeriodicBoxSizeArg
(
cl
,
pmeInterpolateForceKernel
,
3
);
...
...
@@ -5972,7 +5993,7 @@ string OpenCLIntegrateCustomStepKernel::createGlobalComputation(const string& va
variables
[
"dt"
]
=
"dt[0].y"
;
variables
[
"uniform"
]
=
"uniform"
;
variables
[
"gaussian"
]
=
"gaussian"
;
variables[energyName] = "energy
[0]
";
variables
[
energyName
]
=
"energy"
;
for
(
int
i
=
0
;
i
<
integrator
.
getNumGlobalVariables
();
i
++
)
variables
[
integrator
.
getGlobalVariableName
(
i
)]
=
"globals["
+
cl
.
intToString
(
i
)
+
"]"
;
for
(
int
i
=
0
;
i
<
(
int
)
parameterNames
.
size
();
i
++
)
...
...
@@ -6008,7 +6029,7 @@ string OpenCLIntegrateCustomStepKernel::createPerDofComputation(const string& va
variables
[
"m"
]
=
"mass"
;
variables
[
"dt"
]
=
"stepSize"
;
if
(
energyName
!=
""
)
variables[energyName] = "energy
[0]
";
variables
[
energyName
]
=
"energy"
;
for
(
int
i
=
0
;
i
<
integrator
.
getNumGlobalVariables
();
i
++
)
variables
[
integrator
.
getGlobalVariableName
(
i
)]
=
"globals["
+
cl
.
intToString
(
i
)
+
"]"
;
for
(
int
i
=
0
;
i
<
integrator
.
getNumPerDofVariables
();
i
++
)
...
...
@@ -6238,8 +6259,7 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
globalValues
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
contextParameterValues
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
sumBuffer
->
getDeviceBuffer
());
index += 3;
kernel.setArg<cl::Buffer>(index++, potentialEnergy->getDeviceBuffer());
index
+=
4
;
for
(
int
i
=
0
;
i
<
(
int
)
perDofValues
->
getBuffers
().
size
();
i
++
)
kernel
.
setArg
<
cl
::
Memory
>
(
index
++
,
perDofValues
->
getBuffers
()[
i
].
getMemory
());
if
(
stepType
[
step
]
==
CustomIntegrator
::
ComputeSum
)
{
...
...
@@ -6284,8 +6304,6 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
integration
.
getStepSize
().
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
globalValues
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
contextParameterValues
->
getDeviceBuffer
());
index += 2;
kernel.setArg<cl::Buffer>(index++, potentialEnergy->getDeviceBuffer());
}
else
if
(
stepType
[
step
]
==
CustomIntegrator
::
ConstrainPositions
)
{
// Apply position constraints.
...
...
@@ -6326,16 +6344,6 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
randomKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
uniformRandoms
->
getDeviceBuffer
());
randomKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
randomSeed
->
getDeviceBuffer
());
// Create the kernel for summing the potential energy.
cl::Program program = cl.createProgram(OpenCLKernelSources::customIntegrator, defines);
sumPotentialEnergyKernel = cl::Kernel(program, cl.getUseDoublePrecision() ? "computeDoubleSum" : "computeFloatSum");
int index = 0;
sumPotentialEnergyKernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer());
sumPotentialEnergyKernel.setArg<cl::Buffer>(index++, potentialEnergy->getDeviceBuffer());
sumPotentialEnergyKernel.setArg<cl_int>(index++, 0);
sumPotentialEnergyKernel.setArg<cl_int>(index++, cl.getEnergyBuffer().getSize());
// Create the kernel for computing kinetic energy.
stringstream
computeKE
;
...
...
@@ -6359,9 +6367,9 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
replacements
[
"PARAMETER_ARGUMENTS"
]
=
args
.
str
();
if
(
defines
.
find
(
"LOAD_POS_AS_DELTA"
)
!=
defines
.
end
())
defines
.
erase
(
"LOAD_POS_AS_DELTA"
);
program = cl.createProgram(cl.replaceStrings(OpenCLKernelSources::customIntegratorPerDof, replacements), defines);
cl
::
Program
program
=
cl
.
createProgram
(
cl
.
replaceStrings
(
OpenCLKernelSources
::
customIntegratorPerDof
,
replacements
),
defines
);
kineticEnergyKernel
=
cl
::
Kernel
(
program
,
"computePerDof"
);
index = 0;
int
index
=
0
;
kineticEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getPosq
().
getDeviceBuffer
());
setPosqCorrectionArg
(
cl
,
kineticEnergyKernel
,
index
++
);
kineticEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
integration
.
getPosDelta
().
getDeviceBuffer
());
...
...
@@ -6373,7 +6381,10 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
kineticEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
sumBuffer
->
getDeviceBuffer
());
index
+=
2
;
kineticEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
uniformRandoms
->
getDeviceBuffer
());
kineticEnergyKernel.setArg<cl::Buffer>(index++, potentialEnergy->getDeviceBuffer());
if
(
cl
.
getUseDoublePrecision
())
kineticEnergyKernel
.
setArg
<
cl_double
>
(
index
++
,
0.0
);
else
kineticEnergyKernel
.
setArg
<
cl_float
>
(
index
++
,
0.0
f
);
for
(
int
i
=
0
;
i
<
(
int
)
perDofValues
->
getBuffers
().
size
();
i
++
)
kineticEnergyKernel
.
setArg
<
cl
::
Memory
>
(
index
++
,
perDofValues
->
getBuffers
()[
i
].
getMemory
());
keNeedsForce
=
usesVariable
(
keExpression
,
"f"
);
...
...
@@ -6480,9 +6491,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
}
else
{
recordChangedParameters
(
context
);
context.calcForcesAndEnergy(computeForce, computeEnergy, forceGroup[i]);
if (computeEnergy)
cl.executeKernel(sumPotentialEnergyKernel, OpenCLContext::ThreadBlockSize, OpenCLContext::ThreadBlockSize);
energy
=
context
.
calcForcesAndEnergy
(
computeForce
,
computeEnergy
,
forceGroup
[
i
]);
forcesAreValid
=
true
;
}
}
...
...
@@ -6490,6 +6499,10 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
kernels
[
i
][
0
].
setArg
<
cl_uint
>
(
10
,
integration
.
prepareRandomNumbers
(
requiredGaussian
[
i
]));
kernels
[
i
][
0
].
setArg
<
cl
::
Buffer
>
(
9
,
integration
.
getRandom
().
getDeviceBuffer
());
kernels
[
i
][
0
].
setArg
<
cl
::
Buffer
>
(
11
,
uniformRandoms
->
getDeviceBuffer
());
if
(
cl
.
getUseDoublePrecision
())
kernels
[
i
][
0
].
setArg
<
cl_double
>
(
12
,
energy
);
else
kernels
[
i
][
0
].
setArg
<
cl_float
>
(
12
,
(
cl_float
)
energy
);
if
(
requiredUniform
[
i
]
>
0
)
cl
.
executeKernel
(
randomKernel
,
numAtoms
);
cl
.
executeKernel
(
kernels
[
i
][
0
],
numAtoms
);
...
...
@@ -6497,12 +6510,20 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
else
if
(
stepType
[
i
]
==
CustomIntegrator
::
ComputeGlobal
&&
!
merged
[
i
])
{
kernels
[
i
][
0
].
setArg
<
cl_float
>
(
3
,
(
cl_float
)
SimTKOpenMMUtilities
::
getUniformlyDistributedRandomNumber
());
kernels
[
i
][
0
].
setArg
<
cl_float
>
(
4
,
(
cl_float
)
SimTKOpenMMUtilities
::
getNormallyDistributedRandomNumber
());
if
(
cl
.
getUseDoublePrecision
())
kernels
[
i
][
0
].
setArg
<
cl_double
>
(
5
,
energy
);
else
kernels
[
i
][
0
].
setArg
<
cl_float
>
(
5
,
(
cl_float
)
energy
);
cl
.
executeKernel
(
kernels
[
i
][
0
],
1
,
1
);
}
else
if
(
stepType
[
i
]
==
CustomIntegrator
::
ComputeSum
)
{
kernels
[
i
][
0
].
setArg
<
cl_uint
>
(
10
,
integration
.
prepareRandomNumbers
(
requiredGaussian
[
i
]));
kernels
[
i
][
0
].
setArg
<
cl
::
Buffer
>
(
9
,
integration
.
getRandom
().
getDeviceBuffer
());
kernels
[
i
][
0
].
setArg
<
cl
::
Buffer
>
(
11
,
uniformRandoms
->
getDeviceBuffer
());
if
(
cl
.
getUseDoublePrecision
())
kernels
[
i
][
0
].
setArg
<
cl_double
>
(
12
,
energy
);
else
kernels
[
i
][
0
].
setArg
<
cl_float
>
(
12
,
(
cl_float
)
energy
);
if
(
requiredUniform
[
i
]
>
0
)
cl
.
executeKernel
(
randomKernel
,
numAtoms
);
cl
.
clearBuffer
(
*
sumBuffer
);
...
...
@@ -6552,9 +6573,7 @@ double OpenCLIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& contex
bool
willNeedEnergy
=
false
;
for
(
int
i
=
0
;
i
<
integrator
.
getNumComputations
();
i
++
)
willNeedEnergy
|=
needsEnergy
[
i
];
context.calcForcesAndEnergy(true, willNeedEnergy, -1);
if (willNeedEnergy)
cl.executeKernel(sumPotentialEnergyKernel, OpenCLContext::ThreadBlockSize, OpenCLContext::ThreadBlockSize);
energy
=
context
.
calcForcesAndEnergy
(
true
,
willNeedEnergy
,
-
1
);
forcesAreValid
=
true
;
}
cl
.
clearBuffer
(
*
sumBuffer
);
...
...
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
View file @
59bccb15
...
...
@@ -303,6 +303,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
defines
[
"PADDED_CUTOFF_SQUARED"
]
=
context
.
doubleToString
(
paddedCutoff
*
paddedCutoff
);
defines
[
"NUM_TILES_WITH_EXCLUSIONS"
]
=
context
.
intToString
(
exclusionTiles
->
getSize
());
defines
[
"NUM_BLOCKS"
]
=
context
.
intToString
(
context
.
getNumAtomBlocks
());
defines
[
"SIMD_WIDTH"
]
=
context
.
intToString
(
context
.
getSIMDWidth
());
if
(
usePeriodic
)
defines
[
"USE_PERIODIC"
]
=
"1"
;
int
maxExclusions
=
0
;
...
...
platforms/opencl/src/OpenCLPlatform.cpp
View file @
59bccb15
...
...
@@ -109,7 +109,7 @@ bool OpenCLPlatform::supportsDoublePrecision() const {
bool
OpenCLPlatform
::
isPlatformSupported
()
{
// Return false for OpenCL implementations that are known
// to be buggy (Apple OSX
since 10.7.5)
// to be buggy (Apple OS
X
prior to 10.10).
#ifdef __APPLE__
char
str
[
256
];
...
...
@@ -122,12 +122,10 @@ bool OpenCLPlatform::isPlatformSupported() {
if
(
sscanf
(
str
,
"%d.%d.%d"
,
&
major
,
&
minor
,
&
micro
)
!=
3
)
return
false
;
if
((
major
>
11
)
||
(
major
==
11
&&
minor
>
4
)
||
(
major
==
11
&&
minor
==
4
&&
micro
>=
2
))
// 11.4.2 is the darwin release corresponding to OSX 10.7.5, which is the
// point at which a number of serious bugs were introduced into the
// Apple OpenCL libraries, resulting in catistrophically incorrect MD simulations
// (see https://github.com/SimTk/openmm/issues/395 for example). Once a fix is released,
// this version check should be updated.
if
(
major
<
14
||
(
major
==
14
&&
minor
<
3
))
// 14.3.0 is the darwin release corresponding to OS X 10.10.3. Versions prior to that
// contained a number of serious bugs in the Apple OpenCL libraries.
// (See https://github.com/SimTk/openmm/issues/395 for example.)
return
false
;
#endif
...
...
platforms/opencl/src/OpenCLSort.cpp
View file @
59bccb15
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-201
3
Stanford University and the Authors. *
* Portions copyright (c) 2010-201
5
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -42,7 +42,6 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
replacements
[
"MIN_KEY"
]
=
trait
->
getMinKey
();
replacements
[
"MAX_KEY"
]
=
trait
->
getMaxKey
();
replacements
[
"MAX_VALUE"
]
=
trait
->
getMaxValue
();
replacements
[
"VALUE_IS_INT2"
]
=
(
trait
->
getDataType
()
==
std
::
string
(
"int2"
)
?
"1"
:
"0"
);
cl
::
Program
program
=
context
.
createProgram
(
context
.
replaceStrings
(
OpenCLKernelSources
::
sort
,
replacements
));
shortListKernel
=
cl
::
Kernel
(
program
,
"sortShortList"
);
computeRangeKernel
=
cl
::
Kernel
(
program
,
"computeRange"
);
...
...
@@ -59,7 +58,11 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
unsigned
int
maxRangeSize
=
std
::
min
(
maxGroupSize
,
(
unsigned
int
)
computeRangeKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
()));
unsigned
int
maxPositionsSize
=
std
::
min
(
maxGroupSize
,
(
unsigned
int
)
computeBucketPositionsKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
()));
unsigned
int
maxShortListSize
=
shortListKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
());
isShortList
=
(
length
<=
maxLocalBuffer
&&
length
<
maxShortListSize
);
// On Qualcomm's OpenCL, it's essential to check against maxShortListSize. Otherwise you get a crash.
// But AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
// maximum, so including the check hurts performance. For the moment I'm going to just comment it out.
// If we officially support Qualcomm in the future, we'll need to do something better.
isShortList
=
(
length
<=
maxLocalBuffer
/* && length < maxShortListSize*/
);
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxRangeSize
;
rangeKernelSize
*=
2
)
;
positionsKernelSize
=
std
::
min
(
rangeKernelSize
,
maxPositionsSize
);
...
...
platforms/opencl/src/kernels/customIntegratorGlobal.cl
View file @
59bccb15
__kernel
void
computeGlobal
(
__global
mixed2*
restrict
dt,
__global
mixed*
restrict
globals,
__global
mixed*
restrict
params,
float
uniform,
float
gaussian,
__global
const
real
*
restrict
energy
)
{
float
uniform,
float
gaussian,
const
real
energy
)
{
COMPUTE_STEP
}
platforms/opencl/src/kernels/customIntegratorPerDof.cl
View file @
59bccb15
...
...
@@ -26,7 +26,7 @@ void storePos(__global real4* restrict posq, __global real4* restrict posqCorrec
__kernel
void
computePerDof
(
__global
real4*
restrict
posq,
__global
real4*
restrict
posqCorrection,
__global
mixed4*
restrict
posDelta,
__global
mixed4*
restrict
velm,
__global
const
real4*
restrict
force,
__global
const
mixed2*
restrict
dt,
__global
const
mixed*
restrict
globals,
__global
const
mixed*
restrict
params,
__global
mixed*
restrict
sum,
__global
const
float4*
restrict
gaussianValues,
unsigned
int
gaussianBaseIndex,
__global
const
float4*
restrict
uniformValues,
__global
const
real
*
restrict
energy
unsigned
int
gaussianBaseIndex,
__global
const
float4*
restrict
uniformValues,
const
real
energy
PARAMETER_ARGUMENTS
)
{
mixed
stepSize
=
dt[0].y
;
int
index
=
get_global_id
(
0
)
;
...
...
platforms/opencl/src/kernels/fft.cl
View file @
59bccb15
...
...
@@ -2,26 +2,57 @@ real2 multiplyComplex(real2 c1, real2 c2) {
return
(
real2
)
(
c1.x*c2.x-c1.y*c2.y,
c1.x*c2.y+c1.y*c2.x
)
;
}
/**
*
Load
a
value
from
the
half-complex
grid
produces
by
a
real-to-complex
transform.
*/
real2
loadComplexValue
(
__global
const
real2*
restrict
in,
int
x,
int
y,
int
z
)
{
const
int
inputZSize
=
ZSIZE/2+1
;
if
(
z
<
inputZSize
)
return
in[x*YSIZE*inputZSize+y*inputZSize+z]
;
int
xp
=
(
x
==
0
?
0
:
XSIZE-x
)
;
int
yp
=
(
y
==
0
?
0
:
YSIZE-y
)
;
real2
value
=
in[xp*YSIZE*inputZSize+yp*inputZSize+
(
ZSIZE-z
)
]
;
return
(
real2
)
(
value.x,
-value.y
)
;
}
/**
*
Perform
a
1D
FFT
on
each
row
along
one
axis.
*/
__kernel
void
execFFT
(
__global
const
real2
*
restrict
in,
__global
real2
*
restrict
out,
int
sign,
__local
real2*
restrict
w,
__kernel
void
execFFT
(
__global
const
INPUT_TYPE
*
restrict
in,
__global
OUTPUT_TYPE
*
restrict
out,
__local
real2*
restrict
w,
__local
real2*
restrict
data0,
__local
real2*
restrict
data1
)
{
for
(
int
i
=
get_local_id
(
0
)
; i < ZSIZE; i += get_local_size(0))
w[i]
=
(
real2
)
(
cos
(
-
sign
*i*2*M_PI/ZSIZE
)
,
sin
(
-
sign
*i*2*M_PI/ZSIZE
))
;
w[i]
=
(
real2
)
(
cos
(
-
(
SIGN
)
*i*2*
M_PI/ZSIZE
)
,
sin
(
-
(
SIGN
)
*i*2*
M_PI/ZSIZE
))
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
baseIndex
=
get_group_id
(
0
)
*BLOCKS_PER_GROUP
; baseIndex < XSIZE*YSIZE; baseIndex += get_num_groups(0)*BLOCKS_PER_GROUP) {
int
index
=
baseIndex+get_local_id
(
0
)
/ZSIZE
;
int
x
=
index/YSIZE
;
int
y
=
index-x*YSIZE
;
#
if
OUTPUT_IS_PACKED
if
(
x
<
XSIZE/2+1
)
{
#
endif
#
if
LOOP_REQUIRED
for
(
int
z
=
get_local_id
(
0
)
; z < ZSIZE; z += get_local_size(0))
#
if
INPUT_IS_REAL
data0[z]
=
(
real2
)
(
in[x*
(
YSIZE*ZSIZE
)
+y*ZSIZE+z],
0
)
;
#
elif
INPUT_IS_PACKED
data0[z]
=
loadComplexValue
(
in,
x,
y,
z
)
;
#
else
data0[z]
=
in[x*
(
YSIZE*ZSIZE
)
+y*ZSIZE+z]
;
#
endif
#
else
if
(
index
<
XSIZE*YSIZE
)
#
if
INPUT_IS_REAL
data0[get_local_id
(
0
)
]
=
(
real2
)
(
in[x*
(
YSIZE*ZSIZE
)
+y*ZSIZE+get_local_id
(
0
)
%ZSIZE],
0
)
;
#
elif
INPUT_IS_PACKED
data0[get_local_id
(
0
)
]
=
loadComplexValue
(
in,
x,
y,
get_local_id
(
0
)
%ZSIZE
)
;
#
else
data0[get_local_id
(
0
)
]
=
in[x*
(
YSIZE*ZSIZE
)
+y*ZSIZE+get_local_id
(
0
)
%ZSIZE]
;
#
endif
#
endif
#
if
OUTPUT_IS_PACKED
}
#
endif
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
COMPUTE_FFT
...
...
platforms/opencl/src/kernels/fftR2C.cl
0 → 100644
View file @
59bccb15
/**
*
Combine
the
two
halves
of
a
real
grid
into
a
complex
grid
that
is
half
as
large.
*/
__kernel
void
packForwardData
(
__global
const
real*
restrict
in,
__global
real2*
restrict
out
)
{
const
int
gridSize
=
PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE
;
for
(
int
index
=
get_global_id
(
0
)
; index < gridSize; index += get_global_size(0)) {
int
x
=
index/
(
PACKED_YSIZE*PACKED_ZSIZE
)
;
int
remainder
=
index-x*
(
PACKED_YSIZE*PACKED_ZSIZE
)
;
int
y
=
remainder/PACKED_ZSIZE
;
int
z
=
remainder-y*PACKED_ZSIZE
;
#
if
PACKED_AXIS
==
0
real2
value
=
(
real2
)
(
in[2*x*YSIZE*ZSIZE+y*ZSIZE+z],
in[
(
2*x+1
)
*YSIZE*ZSIZE+y*
ZSIZE+z]
)
;
#
elif
PACKED_AXIS
==
1
real2
value
=
(
real2
)
(
in[x*YSIZE*ZSIZE+2*y*ZSIZE+z],
in[x*YSIZE*ZSIZE+
(
2*y+1
)
*ZSIZE+z]
)
;
#
else
real2
value
=
(
real2
)
(
in[x*YSIZE*ZSIZE+y*ZSIZE+2*z],
in[x*YSIZE*ZSIZE+y*ZSIZE+
(
2*z+1
)
]
)
;
#
endif
out[index]
=
value
;
}
}
/**
*
Split
the
transformed
data
back
into
a
full
sized,
symmetric
grid.
*/
__kernel
void
unpackForwardData
(
__global
const
real2*
restrict
in,
__global
real2*
restrict
out,
__local
real2*
restrict
w
)
{
//
Compute
the
phase
factors.
#
if
PACKED_AXIS
==
0
for
(
int
i
=
get_local_id
(
0
)
; i < PACKED_XSIZE; i += get_local_size(0))
w[i]
=
(
real2
)
(
sin
(
i*2*M_PI/XSIZE
)
,
cos
(
i*2*M_PI/XSIZE
))
;
#
elif
PACKED_AXIS
==
1
for
(
int
i
=
get_local_id
(
0
)
; i < PACKED_YSIZE; i += get_local_size(0))
w[i]
=
(
real2
)
(
sin
(
i*2*M_PI/YSIZE
)
,
cos
(
i*2*M_PI/YSIZE
))
;
#
else
for
(
int
i
=
get_local_id
(
0
)
; i < PACKED_ZSIZE; i += get_local_size(0))
w[i]
=
(
real2
)
(
sin
(
i*2*M_PI/ZSIZE
)
,
cos
(
i*2*M_PI/ZSIZE
))
;
#
endif
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Transform
the
data.
const
int
gridSize
=
PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE
;
const
int
outputZSize
=
ZSIZE/2+1
;
for
(
int
index
=
get_global_id
(
0
)
; index < gridSize; index += get_global_size(0)) {
int
x
=
index/
(
PACKED_YSIZE*PACKED_ZSIZE
)
;
int
remainder
=
index-x*
(
PACKED_YSIZE*PACKED_ZSIZE
)
;
int
y
=
remainder/PACKED_ZSIZE
;
int
z
=
remainder-y*PACKED_ZSIZE
;
int
xp
=
(
x
==
0
?
0
:
PACKED_XSIZE-x
)
;
int
yp
=
(
y
==
0
?
0
:
PACKED_YSIZE-y
)
;
int
zp
=
(
z
==
0
?
0
:
PACKED_ZSIZE-z
)
;
real2
z1
=
in[x*PACKED_YSIZE*PACKED_ZSIZE+y*PACKED_ZSIZE+z]
;
real2
z2
=
in[xp*PACKED_YSIZE*PACKED_ZSIZE+yp*PACKED_ZSIZE+zp]
;
#
if
PACKED_AXIS
==
0
real2
wfac
=
w[x]
;
#
elif
PACKED_AXIS
==
1
real2
wfac
=
w[y]
;
#
else
real2
wfac
=
w[z]
;
#
endif
real2
output
=
(
real2
)
((
z1.x+z2.x
-
wfac.x*
(
z1.x-z2.x
)
+
wfac.y*
(
z1.y+z2.y
))
/2,
(
z1.y-z2.y
-
wfac.y*
(
z1.x-z2.x
)
-
wfac.x*
(
z1.y+z2.y
))
/2
)
;
if
(
z
<
outputZSize
)
out[x*YSIZE*outputZSize+y*outputZSize+z]
=
output
;
xp
=
(
x
==
0
?
0
:
XSIZE-x
)
;
yp
=
(
y
==
0
?
0
:
YSIZE-y
)
;
zp
=
(
z
==
0
?
0
:
ZSIZE-z
)
;
if
(
zp
<
outputZSize
)
{
#
if
PACKED_AXIS
==
0
if
(
x
==
0
)
out[PACKED_XSIZE*YSIZE*outputZSize+yp*outputZSize+zp]
=
(
real2
)
((
z1.x-z1.y+z2.x-z2.y
)
/2,
(
-z1.x-z1.y+z2.x+z2.y
)
/2
)
;
#
elif
PACKED_AXIS
==
1
if
(
y
==
0
)
out[xp*YSIZE*outputZSize+PACKED_YSIZE*outputZSize+zp]
=
(
real2
)
((
z1.x-z1.y+z2.x-z2.y
)
/2,
(
-z1.x-z1.y+z2.x+z2.y
)
/2
)
;
#
else
if
(
z
==
0
)
out[xp*YSIZE*outputZSize+yp*outputZSize+PACKED_ZSIZE]
=
(
real2
)
((
z1.x-z1.y+z2.x-z2.y
)
/2,
(
-z1.x-z1.y+z2.x+z2.y
)
/2
)
;
#
endif
else
out[xp*YSIZE*outputZSize+yp*outputZSize+zp]
=
(
real2
)
(
output.x,
-output.y
)
;
}
}
}
/**
*
Load
a
value
from
the
half-complex
grid
produced
by
a
real-to-complex
transform.
*/
real2
loadComplexValue
(
__global
const
real2*
restrict
in,
int
x,
int
y,
int
z
)
{
const
int
inputZSize
=
ZSIZE/2+1
;
if
(
z
<
inputZSize
)
return
in[x*YSIZE*inputZSize+y*inputZSize+z]
;
int
xp
=
(
x
==
0
?
0
:
XSIZE-x
)
;
int
yp
=
(
y
==
0
?
0
:
YSIZE-y
)
;
real2
value
=
in[xp*YSIZE*inputZSize+yp*inputZSize+
(
ZSIZE-z
)
]
;
return
(
real2
)
(
value.x,
-value.y
)
;
}
/**
*
Repack
the
symmetric
complex
grid
into
one
half
as
large
in
preparation
for
doing
an
inverse
complex-to-real
transform.
*/
__kernel
void
packBackwardData
(
__global
const
real2*
restrict
in,
__global
real2*
restrict
out,
__local
real2*
restrict
w
)
{
//
Compute
the
phase
factors.
#
if
PACKED_AXIS
==
0
for
(
int
i
=
get_local_id
(
0
)
; i < PACKED_XSIZE; i += get_local_size(0))
w[i]
=
(
real2
)
(
cos
(
i*2*M_PI/XSIZE
)
,
sin
(
i*2*M_PI/XSIZE
))
;
#
elif
PACKED_AXIS
==
1
for
(
int
i
=
get_local_id
(
0
)
; i < PACKED_YSIZE; i += get_local_size(0))
w[i]
=
(
real2
)
(
cos
(
i*2*M_PI/YSIZE
)
,
sin
(
i*2*M_PI/YSIZE
))
;
#
else
for
(
int
i
=
get_local_id
(
0
)
; i < PACKED_ZSIZE; i += get_local_size(0))
w[i]
=
(
real2
)
(
cos
(
i*2*M_PI/ZSIZE
)
,
sin
(
i*2*M_PI/ZSIZE
))
;
#
endif
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Transform
the
data.
const
int
gridSize
=
PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE
;
for
(
int
index
=
get_global_id
(
0
)
; index < gridSize; index += get_global_size(0)) {
int
x
=
index/
(
PACKED_YSIZE*PACKED_ZSIZE
)
;
int
remainder
=
index-x*
(
PACKED_YSIZE*PACKED_ZSIZE
)
;
int
y
=
remainder/PACKED_ZSIZE
;
int
z
=
remainder-y*PACKED_ZSIZE
;
int
xp
=
(
x
==
0
?
0
:
PACKED_XSIZE-x
)
;
int
yp
=
(
y
==
0
?
0
:
PACKED_YSIZE-y
)
;
int
zp
=
(
z
==
0
?
0
:
PACKED_ZSIZE-z
)
;
real2
z1
=
loadComplexValue
(
in,
x,
y,
z
)
;
#
if
PACKED_AXIS
==
0
real2
wfac
=
w[x]
;
real2
z2
=
loadComplexValue
(
in,
PACKED_XSIZE-x,
yp,
zp
)
;
#
elif
PACKED_AXIS
==
1
real2
wfac
=
w[y]
;
real2
z2
=
loadComplexValue
(
in,
xp,
PACKED_YSIZE-y,
zp
)
;
#
else
real2
wfac
=
w[z]
;
real2
z2
=
loadComplexValue
(
in,
xp,
yp,
PACKED_ZSIZE-z
)
;
#
endif
real2
even
=
(
real2
)
((
z1.x+z2.x
)
/2,
(
z1.y-z2.y
)
/2
)
;
real2
odd
=
(
real2
)
((
z1.x-z2.x
)
/2,
(
z1.y+z2.y
)
/2
)
;
odd
=
(
real2
)
(
odd.x*wfac.x-odd.y*wfac.y,
odd.y*wfac.x+odd.x*wfac.y
)
;
out[x*PACKED_YSIZE*PACKED_ZSIZE+y*PACKED_ZSIZE+z]
=
(
real2
)
(
even.x-odd.y,
even.y+odd.x
)
;
}
}
/**
*
Split
the
data
back
into
a
full
sized,
real
grid
after
an
inverse
transform.
*/
__kernel
void
unpackBackwardData
(
__global
const
real2*
restrict
in,
__global
real*
restrict
out
)
{
const
int
gridSize
=
PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE
;
for
(
int
index
=
get_global_id
(
0
)
; index < gridSize; index += get_global_size(0)) {
int
x
=
index/
(
PACKED_YSIZE*PACKED_ZSIZE
)
;
int
remainder
=
index-x*
(
PACKED_YSIZE*PACKED_ZSIZE
)
;
int
y
=
remainder/PACKED_ZSIZE
;
int
z
=
remainder-y*PACKED_ZSIZE
;
real2
value
=
2*in[index]
;
#
if
PACKED_AXIS
==
0
out[2*x*YSIZE*ZSIZE+y*ZSIZE+z]
=
value.x
;
out[
(
2*x+1
)
*YSIZE*ZSIZE+y*
ZSIZE+z]
=
value.y
;
#
elif
PACKED_AXIS
==
1
out[x*YSIZE*ZSIZE+2*y*ZSIZE+z]
=
value.x
;
out[x*YSIZE*ZSIZE+
(
2*y+1
)
*ZSIZE+z]
=
value.y
;
#
else
out[x*YSIZE*ZSIZE+y*ZSIZE+2*z]
=
value.x
;
out[x*YSIZE*ZSIZE+y*ZSIZE+
(
2*z+1
)
]
=
value.y
;
#
endif
}
}
platforms/opencl/src/kernels/findInteractingBlocks.cl
View file @
59bccb15
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_byte_addressable_store
:
enable
#
define
BUFFER_SIZE
256
/**
*
Find
a
bounding
box
for
the
atoms
in
each
block.
...
...
@@ -65,6 +64,10 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c
}
}
#
if
SIMD_WIDTH
<=
32
#
define
BUFFER_SIZE
256
__kernel
void
findBlocksWithInteractions
(
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
real4
periodicBoxVecX,
real4
periodicBoxVecY,
real4
periodicBoxVecZ,
__global
unsigned
int*
restrict
interactionCount,
__global
int*
restrict
interactingTiles,
__global
unsigned
int*
restrict
interactingAtoms,
__global
const
real4*
restrict
posq,
unsigned
int
maxTiles,
unsigned
int
startBlockIndex,
unsigned
int
numBlocks,
__global
real2*
restrict
sortedBlocks,
...
...
@@ -133,8 +136,8 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
int
block2
=
block2Base+indexInWarp
;
bool
includeBlock2
=
(
block2
<
NUM_BLOCKS
)
;
if
(
includeBlock2
)
{
real4
blockCenterY
=
(
block2
<
NUM_BLOCKS
?
sortedBlockCenter[block2]
:
(
real4
)
(
0
))
;
real4
blockSizeY
=
(
block2
<
NUM_BLOCKS
?
sortedBlockBoundingBox[block2]
:
(
real4
)
(
0
))
;
real4
blockCenterY
=
sortedBlockCenter[block2]
;
real4
blockSizeY
=
sortedBlockBoundingBox[block2]
;
real4
blockDelta
=
blockCenterX-blockCenterY
;
#
ifdef
USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
blockDelta
)
...
...
@@ -252,3 +255,279 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
for
(
int
i
=
get_global_id
(
0
)
; i < NUM_ATOMS; i += get_global_size(0))
oldPositions[i]
=
posq[i]
;
}
#
else
//
This
is
the
old
implementation
of
finding
interacting
blocks.
It
is
quite
a
bit
more
complicated,
//
and
slower
on
most
GPUs.
On
AMD,
however,
it
is
faster,
so
we
keep
it
around
to
use
there.
#
define
BUFFER_SIZE
BUFFER_GROUPS*GROUP_SIZE
#
define
WARP_SIZE
32
#
define
INVALID
0xFFFF
/**
*
Perform
a
parallel
prefix
sum
over
an
array.
The
input
values
are
all
assumed
to
be
0
or
1.
*/
void
prefixSum
(
__local
short*
sum,
__local
ushort2*
temp
)
{
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
temp[i].x
=
sum[i]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
whichBuffer
=
0
;
for
(
int
offset
=
1
; offset < BUFFER_SIZE; offset *= 2) {
if
(
whichBuffer
==
0
)
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
temp[i].y
=
(
i
<
offset
?
temp[i].x
:
temp[i].x+temp[i-offset].x
)
;
else
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
temp[i].x
=
(
i
<
offset
?
temp[i].y
:
temp[i].y+temp[i-offset].y
)
;
whichBuffer
=
1-whichBuffer
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
whichBuffer
==
0
)
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
sum[i]
=
temp[i].x
;
else
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
sum[i]
=
temp[i].y
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
/**
*
This
is
called
by
findBlocksWithInteractions
()
.
It
compacts
the
list
of
blocks,
identifies
interactions
*
in
them,
and
writes
the
result
to
global
memory.
*/
void
storeInteractionData
(
int
x,
__local
unsigned
short*
buffer,
__local
short*
sum,
__local
ushort2*
temp,
__local
int*
atoms,
__local
int*
numAtoms,
__local
int*
baseIndex,
__global
unsigned
int*
interactionCount,
__global
int*
interactingTiles,
__global
unsigned
int*
interactingAtoms,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
real4
periodicBoxVecX,
real4
periodicBoxVecY,
real4
periodicBoxVecZ,
__global
const
real4*
posq,
__local
real4*
posBuffer,
real4
blockCenterX,
real4
blockSizeX,
unsigned
int
maxTiles,
bool
finish
)
{
const
bool
singlePeriodicCopy
=
(
0.5f*periodicBoxSize.x-blockSizeX.x
>=
PADDED_CUTOFF
&&
0.5f*periodicBoxSize.y-blockSizeX.y
>=
PADDED_CUTOFF
&&
0.5f*periodicBoxSize.z-blockSizeX.z
>=
PADDED_CUTOFF
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
real4
pos
=
posq[x*TILE_SIZE+get_local_id
(
0
)
]
;
#
ifdef
USE_PERIODIC
if
(
singlePeriodicCopy
)
{
//
The
box
is
small
enough
that
we
can
just
translate
all
the
atoms
into
a
single
periodic
//
box,
then
skip
having
to
apply
periodic
boundary
conditions
later.
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
pos,
blockCenterX
)
}
#
endif
posBuffer[get_local_id
(
0
)
]
=
pos
;
}
//
The
buffer
is
full,
so
we
need
to
compact
it
and
write
out
results.
Start
by
doing
a
parallel
prefix
sum.
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
sum[i]
=
(
buffer[i]
==
INVALID
?
0
:
1
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
prefixSum
(
sum,
temp
)
;
int
numValid
=
sum[BUFFER_SIZE-1]
;
//
Compact
the
buffer.
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
if
(
buffer[i]
!=
INVALID
)
temp[sum[i]-1].x
=
buffer[i]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
buffer[i]
=
temp[i].x
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Loop
over
the
tiles
and
find
specific
interactions
in
them.
const
int
indexInWarp
=
get_local_id
(
0
)
%WARP_SIZE
;
for
(
int
base
=
0
; base < numValid; base += BUFFER_SIZE/WARP_SIZE) {
for
(
int
i
=
get_local_id
(
0
)
/WARP_SIZE
; i < BUFFER_SIZE/WARP_SIZE && base+i < numValid; i += GROUP_SIZE/WARP_SIZE) {
//
Check
each
atom
in
block
Y
for
interactions.
real4
pos
=
posq[buffer[base+i]*TILE_SIZE+indexInWarp]
;
#
ifdef
USE_PERIODIC
if
(
singlePeriodicCopy
)
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
pos,
blockCenterX
)
#
endif
bool
interacts
=
false
;
#
ifdef
USE_PERIODIC
if
(
!singlePeriodicCopy
)
{
for
(
int
j
=
0
; j < TILE_SIZE; j++) {
real4
delta
=
pos-posBuffer[j]
;
APPLY_PERIODIC_TO_DELTA
(
delta
)
interacts
|= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
}
}
else {
#endif
for (int j = 0; j < TILE_SIZE; j++) {
real4 delta = pos-posBuffer[j];
interacts |
=
(
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
<
PADDED_CUTOFF_SQUARED
)
;
}
#
ifdef
USE_PERIODIC
}
#
endif
sum[i*WARP_SIZE+indexInWarp]
=
(
interacts
?
1
:
0
)
;
}
for
(
int
i
=
numValid-base+get_local_id
(
0
)
/WARP_SIZE
; i < BUFFER_SIZE/WARP_SIZE; i += GROUP_SIZE/WARP_SIZE)
sum[i*WARP_SIZE+indexInWarp]
=
0
;
//
Compact
the
list
of
atoms.
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
prefixSum
(
sum,
temp
)
;
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
if
(
sum[i]
!=
(
i
==
0
?
0
:
sum[i-1]
))
atoms[*numAtoms+sum[i]-1]
=
buffer[base+i/WARP_SIZE]*TILE_SIZE+indexInWarp
;
//
Store
them
to
global
memory.
int
atomsToStore
=
*numAtoms+sum[BUFFER_SIZE-1]
;
bool
storePartialTile
=
(
finish
&&
base
>=
numValid-BUFFER_SIZE/WARP_SIZE
)
;
int
tilesToStore
=
(
storePartialTile
?
(
atomsToStore+TILE_SIZE-1
)
/TILE_SIZE
:
atomsToStore/TILE_SIZE
)
;
if
(
tilesToStore
>
0
)
{
if
(
get_local_id
(
0
)
==
0
)
*baseIndex
=
atom_add
(
interactionCount,
tilesToStore
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
==
0
)
*numAtoms
=
atomsToStore-tilesToStore*TILE_SIZE
;
if
(
*baseIndex+tilesToStore
<=
maxTiles
)
{
if
(
get_local_id
(
0
)
<
tilesToStore
)
interactingTiles[*baseIndex+get_local_id
(
0
)
]
=
x
;
for
(
int
i
=
get_local_id
(
0
)
; i < tilesToStore*TILE_SIZE; i += get_local_size(0))
interactingAtoms[*baseIndex*TILE_SIZE+i]
=
(
i
<
atomsToStore
?
atoms[i]
:
NUM_ATOMS
)
;
}
}
else
{
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
==
0
)
*numAtoms
+=
sum[BUFFER_SIZE-1]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
*numAtoms
&&
!storePartialTile
)
atoms[get_local_id
(
0
)
]
=
atoms[tilesToStore*TILE_SIZE+get_local_id
(
0
)
]
;
}
if
(
numValid
==
0
&&
*numAtoms
>
0
&&
finish
)
{
//
We
didn
't
have
any
more
tiles
to
process,
but
there
were
some
atoms
left
over
from
a
//
previous
call
to
this
function.
Save
them
now.
if
(
get_local_id
(
0
)
==
0
)
*baseIndex
=
atom_add
(
interactionCount,
1
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
*baseIndex
<
maxTiles
)
{
if
(
get_local_id
(
0
)
==
0
)
interactingTiles[*baseIndex]
=
x
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
interactingAtoms[*baseIndex*TILE_SIZE+get_local_id
(
0
)
]
=
(
get_local_id
(
0
)
<
*numAtoms
?
atoms[get_local_id
(
0
)
]
:
NUM_ATOMS
)
;
}
}
//
Reset
the
buffer
for
processing
more
tiles.
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
buffer[i]
=
INVALID
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
/**
*
Compare
the
bounding
boxes
for
each
pair
of
blocks.
If
they
are
sufficiently
far
apart,
*
mark
them
as
non-interacting.
*/
__kernel
void
findBlocksWithInteractions
(
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
real4
periodicBoxVecX,
real4
periodicBoxVecY,
real4
periodicBoxVecZ,
__global
unsigned
int*
restrict
interactionCount,
__global
int*
restrict
interactingTiles,
__global
unsigned
int*
restrict
interactingAtoms,
__global
const
real4*
restrict
posq,
unsigned
int
maxTiles,
unsigned
int
startBlockIndex,
unsigned
int
numBlocks,
__global
real2*
restrict
sortedBlocks,
__global
const
real4*
restrict
sortedBlockCenter,
__global
const
real4*
restrict
sortedBlockBoundingBox,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__global
real4*
restrict
oldPositions,
__global
const
int*
restrict
rebuildNeighborList
)
{
__local
unsigned
short
buffer[BUFFER_SIZE]
;
__local
short
sum[BUFFER_SIZE]
;
__local
ushort2
temp[BUFFER_SIZE]
;
__local
int
atoms[BUFFER_SIZE+TILE_SIZE]
;
__local
real4
posBuffer[TILE_SIZE]
;
__local
int
exclusionsForX[MAX_EXCLUSIONS]
;
__local
int
bufferFull
;
__local
int
globalIndex
;
__local
int
numAtoms
;
#
ifdef
AMD_ATOMIC_WORK_AROUND
//
Do
a
byte
write
to
force
all
memory
accesses
to
interactionCount
to
use
the
complete
path.
//
This
avoids
the
atomic
access
from
causing
all
word
accesses
to
other
buffers
from
using
the
slow
complete
path.
//
The
IF
actually
causes
the
write
to
never
be
executed,
its
presence
is
all
that
is
needed.
//
AMD
APP
SDK
2.4
has
this
problem.
if
(
get_global_id
(
0
)
==
get_local_id
(
0
)
+1
)
((
__global
char*
)
interactionCount
)
[sizeof
(
unsigned
int
)
+1]
=
0
;
#
endif
if
(
rebuildNeighborList[0]
==
0
)
return
; // The neighbor list doesn't need to be rebuilt.
int
valuesInBuffer
=
0
;
if
(
get_local_id
(
0
)
==
0
)
bufferFull
=
false
;
for
(
int
i
=
0
; i < BUFFER_GROUPS; ++i)
buffer[i*GROUP_SIZE+get_local_id
(
0
)
]
=
INVALID
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Loop
over
blocks
sorted
by
size.
for
(
int
i
=
startBlockIndex+get_group_id
(
0
)
; i < startBlockIndex+numBlocks; i += get_num_groups(0)) {
if
(
get_local_id
(
0
)
==
get_local_size
(
0
)
-1
)
numAtoms
=
0
;
real2
sortedKey
=
sortedBlocks[i]
;
int
x
=
(
int
)
sortedKey.y
;
real4
blockCenterX
=
sortedBlockCenter[i]
;
real4
blockSizeX
=
sortedBlockBoundingBox[i]
;
//
Load
exclusion
data
for
block
x.
const
int
exclusionStart
=
exclusionRowIndices[x]
;
const
int
exclusionEnd
=
exclusionRowIndices[x+1]
;
const
int
numExclusions
=
exclusionEnd-exclusionStart
;
for
(
int
j
=
get_local_id
(
0
)
; j < numExclusions; j += get_local_size(0))
exclusionsForX[j]
=
exclusionIndices[exclusionStart+j]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Compare
it
to
other
blocks
after
this
one
in
sorted
order.
for
(
int
base
=
i+1
; base < NUM_BLOCKS; base += get_local_size(0)) {
int
j
=
base+get_local_id
(
0
)
;
real2
sortedKey2
=
(
j
<
NUM_BLOCKS
?
sortedBlocks[j]
:
(
real2
)
0
)
;
real4
blockCenterY
=
(
j
<
NUM_BLOCKS
?
sortedBlockCenter[j]
:
(
real4
)
0
)
;
real4
blockSizeY
=
(
j
<
NUM_BLOCKS
?
sortedBlockBoundingBox[j]
:
(
real4
)
0
)
;
unsigned
short
y
=
(
unsigned
short
)
sortedKey2.y
;
real4
delta
=
blockCenterX-blockCenterY
;
#
ifdef
USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#
endif
delta.x
=
max
((
real
)
0
,
fabs
(
delta.x
)
-blockSizeX.x-blockSizeY.x
)
;
delta.y
=
max
((
real
)
0
,
fabs
(
delta.y
)
-blockSizeX.y-blockSizeY.y
)
;
delta.z
=
max
((
real
)
0
,
fabs
(
delta.z
)
-blockSizeX.z-blockSizeY.z
)
;
bool
hasExclusions
=
false
;
for
(
int
k
=
0
; k < numExclusions; k++)
hasExclusions
|
=
(
exclusionsForX[k]
==
y
)
;
if
(
j
<
NUM_BLOCKS
&&
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
<
PADDED_CUTOFF_SQUARED
&&
!hasExclusions
)
{
//
Add
this
tile
to
the
buffer.
int
bufferIndex
=
valuesInBuffer*GROUP_SIZE+get_local_id
(
0
)
;
buffer[bufferIndex]
=
y
;
valuesInBuffer++
;
if
(
!bufferFull
&&
valuesInBuffer
==
BUFFER_GROUPS
)
bufferFull
=
true
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
bufferFull
)
{
storeInteractionData
(
x,
buffer,
sum,
temp,
atoms,
&numAtoms,
&globalIndex,
interactionCount,
interactingTiles,
interactingAtoms,
periodicBoxSize,
invPeriodicBoxSize,
periodicBoxVecX,
periodicBoxVecY,
periodicBoxVecZ,
posq,
posBuffer,
blockCenterX,
blockSizeX,
maxTiles,
false
)
;
valuesInBuffer
=
0
;
if
(
get_local_id
(
0
)
==
0
)
bufferFull
=
false
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
storeInteractionData
(
x,
buffer,
sum,
temp,
atoms,
&numAtoms,
&globalIndex,
interactionCount,
interactingTiles,
interactingAtoms,
periodicBoxSize,
invPeriodicBoxSize,
periodicBoxVecX,
periodicBoxVecY,
periodicBoxVecZ,
posq,
posBuffer,
blockCenterX,
blockSizeX,
maxTiles,
true
)
;
}
//
Record
the
positions
the
neighbor
list
is
based
on.
for
(
int
i
=
get_global_id
(
0
)
; i < NUM_ATOMS; i += get_global_size(0))
oldPositions[i]
=
posq[i]
;
}
#
endif
platforms/opencl/src/kernels/pme.cl
View file @
59bccb15
...
...
@@ -138,35 +138,34 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
)
;
int
index
=
xindex*GRID_SIZE_Y*GRID_SIZE_Z
+
yindex*GRID_SIZE_Z
+
zindex
;
real
add
=
pos.w*data[ix].x*data[iy].y*data[iz].z
;
#
ifdef
USE_DOUBLE_PRECISION
atom_add
(
&pmeGrid[2*index],
(
long
)
(
add*0x100000000
))
;
#
ifdef
USE_ALTERNATE_MEMORY_ACCESS_PATTERN
//
On
Nvidia
devices
(
at
least
Maxwell
anyway
)
,
this
split
ordering
produces
much
higher
performance.
Why?
//
I
have
no
idea!
And
of
course
on
AMD
it
produces
slower
performance.
GPUs
are
not
meant
to
be
understood.
atom_add
(
&pmeGrid[index%2
==
0
?
index/2
:
(
index+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z
)
/2],
(
long
)
(
add*0x100000000
))
;
#
else
atom_add
(
&pmeGrid[index],
(
long
)
(
add*0x100000000
))
;
#
endif
}
}
}
}
}
__kernel
void
finishSpreadCharge
(
__global
long*
restrict
pmeGrid
)
{
__global
real2*
realGrid
=
(
__global
real2*
)
pmeGrid
;
__kernel
void
finishSpreadCharge
(
__global
long*
restrict
fixedGrid,
__global
real*
restrict
realGrid
)
{
const
unsigned
int
gridSize
=
GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z
;
real
scale
=
EPSILON_FACTOR/
(
real
)
0x100000000
;
for
(
int
index
=
get_global_id
(
0
)
; index < gridSize; index += get_global_size(0)) {
#
ifdef
USE_
DOUBLE_PRECISIO
N
long
value
=
pmeGrid[2*index
]
;
#
ifdef
USE_
ALTERNATE_MEMORY_ACCESS_PATTER
N
long
value
=
fixedGrid[index%2
==
0
?
index/2
:
(
index+gridSize
)
/2
]
;
#
else
long
value
=
pme
Grid[index]
;
long
value
=
fixed
Grid[index]
;
#
endif
real2
realValue
=
(
real2
)
((
real
)
(
value*scale
)
,
0
)
;
realGrid[index]
=
realValue
;
realGrid[index]
=
(
real
)
(
value*scale
)
;
}
}
#
elif
defined
(
DEVICE_IS_CPU
)
__kernel
void
gridSpreadCharge
(
__global
const
real4*
restrict
posq,
__global
const
int2*
restrict
pmeAtomGridIndex,
__global
const
int*
restrict
pmeAtomRange,
__global
real
2
*
restrict
pmeGrid,
__global
const
real4*
restrict
pmeBsplineTheta,
real4
periodicBoxSize,
real4
recipBoxVecX,
real4
recipBoxVecY,
real4
recipBoxVecZ
)
{
__global
real*
restrict
pmeGrid,
__global
const
real4*
restrict
pmeBsplineTheta,
real4
periodicBoxSize,
real4
recipBoxVecX,
real4
recipBoxVecY,
real4
recipBoxVecZ
)
{
const
int
firstx
=
get_global_id
(
0
)
*GRID_SIZE_X/get_global_size
(
0
)
;
const
int
lastx
=
(
get_global_id
(
0
)
+1
)
*GRID_SIZE_X/get_global_size
(
0
)
;
if
(
firstx
==
lastx
)
...
...
@@ -230,7 +229,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
pmeGrid[index]
.x
+=
EPSILON_FACTOR*pos.w*data[ix].x*data[iy].y*data[iz].z
;
pmeGrid[index] += EPSILON_FACTOR*pos.w*data[ix].x*data[iy].y*data[iz].z;
}
}
}
...
...
@@ -238,7 +237,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
}
#else
__kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
__global
real
2
*
restrict
pmeGrid,
__global
const
real4*
restrict
pmeBsplineTheta
)
{
__global real* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta) {
unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
for (int gridIndex = get_global_id(0); gridIndex < numGridPoints; gridIndex += get_global_size(0)) {
// Compute the charge on a grid point.
...
...
@@ -290,22 +289,23 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
}
}
}
pmeGrid[gridIndex]
=
(
real2
)
(
result*EPSILON_FACTOR
,
0
)
;
pmeGrid[gridIndex] = result*EPSILON_FACTOR;
}
}
#endif
__kernel
void
reciprocalConvolution
(
__global
real2*
restrict
pmeGrid,
__global
real*
restrict
energyBuffer,
__global
const
real*
restrict
pmeBsplineModuliX,
__global
const
real*
restrict
pmeBsplineModuliY,
__global
const
real*
restrict
pmeBsplineModuliZ,
real4
recipBoxVecX,
real4
recipBoxVecY,
real4
recipBoxVecZ,
real
recipScaleFactor
)
{
const
unsigned
int
gridSize
=
GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z
;
real
energy
=
0.0f
;
__kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global const real* restrict pmeBsplineModuliX,
__global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
// R2C stores into a half complex matrix where the last dimension is cut by half
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
const real recipScaleFactor = (1.0f/M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
int
kx
=
index/
(
GRID_SIZE_Y*GRID_SIZE_Z
)
;
int
remainder
=
index-kx*GRID_SIZE_Y*GRID_SIZE_Z
;
int
ky
=
remainder/GRID_SIZE_Z
;
int
kz
=
remainder-ky*GRID_SIZE_Z
;
if
(
kx
==
0
&&
ky
==
0
&&
kz
==
0
)
continue
;
// real indices
int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
int ky = remainder/(GRID_SIZE_Z/2+1);
int kz = remainder-ky*(GRID_SIZE_Z/2+1);
int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
...
...
@@ -319,13 +319,53 @@ __kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global r
real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
real denom = m2*bx*by*bz;
real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
if (kx != 0 || ky != 0 || kz != 0) {
pmeGrid[index] = (real2) (grid.x*eterm, grid.y*eterm);
}
}
}
__kernel void gridEvaluateEnergy(__global real2* restrict pmeGrid, __global real* restrict energyBuffer,
__global const real* restrict pmeBsplineModuliX, __global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ,
real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
// R2C stores into a half complex matrix where the last dimension is cut by half
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
const real recipScaleFactor = (1.0f/M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
real energy = 0;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
// real indices
int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z));
int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z);
int ky = remainder/(GRID_SIZE_Z);
int kz = remainder-ky*(GRID_SIZE_Z);
int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
real mhx = mx*recipBoxVecX.x;
real mhy = mx*recipBoxVecY.x+my*recipBoxVecY.y;
real mhz = mx*recipBoxVecZ.x+my*recipBoxVecZ.y+mz*recipBoxVecZ.z;
real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
real bx = pmeBsplineModuliX[kx];
real by = pmeBsplineModuliY[ky];
real bz = pmeBsplineModuliZ[kz];
real denom = m2*bx*by*bz;
real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
if (kz >= (GRID_SIZE_Z/2+1)) {
kx = ((kx == 0) ? kx : GRID_SIZE_X-kx);
ky = ((ky == 0) ? ky : GRID_SIZE_Y-ky);
kz = GRID_SIZE_Z-kz;
}
int indexInHalfComplexGrid = kz + ky*(GRID_SIZE_Z/2+1)+kx*(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
real2 grid = pmeGrid[indexInHalfComplexGrid];
if (kx != 0 || ky != 0 |
|
kz
!=
0
)
{
energy
+=
eterm*
(
grid.x*grid.x
+
grid.y*grid.y
)
;
}
}
energyBuffer[get_global_id
(
0
)
]
+=
0.5f*energy
;
}
__kernel
void
gridInterpolateForce
(
__global
const
real4*
restrict
posq,
__global
real4*
restrict
forceBuffers,
__global
const
real
2
*
restrict
pmeGrid,
__kernel
void
gridInterpolateForce
(
__global
const
real4*
restrict
posq,
__global
real4*
restrict
forceBuffers,
__global
const
real*
restrict
pmeGrid,
real4
periodicBoxSize,
real4
recipBoxVecX,
real4
recipBoxVecY,
real4
recipBoxVecZ,
__global
int2*
restrict
pmeAtomGridIndex
)
{
const
real4
scale
=
1/
(
real
)
(
PME_ORDER-1
)
;
real4
data[PME_ORDER]
;
...
...
@@ -385,7 +425,7 @@ __kernel void gridInterpolateForce(__global const real4* restrict posq, __global
int
zindex
=
gridIndex.z+iz
;
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
)
;
int
index
=
xindex*GRID_SIZE_Y*GRID_SIZE_Z
+
yindex*GRID_SIZE_Z
+
zindex
;
real
gridvalue
=
pmeGrid[index]
.x
;
real
gridvalue
=
pmeGrid[index]
;
force.x
+=
ddata[ix].x*data[iy].y*data[iz].z*gridvalue
;
force.y
+=
data[ix].x*ddata[iy].y*data[iz].z*gridvalue
;
force.z
+=
data[ix].x*data[iy].y*ddata[iz].z*gridvalue
;
...
...
platforms/opencl/src/kernels/sort.cl
View file @
59bccb15
...
...
@@ -109,13 +109,7 @@ __kernel void assignElementsToBuckets(__global const DATA_TYPE* restrict data, u
float
maxValue
=
(
float
)
(
range[1]
)
;
float
bucketWidth
=
(
maxValue-minValue
)
/numBuckets
;
for
(
uint
index
=
get_global_id
(
0
)
; index < length; index += get_global_size(0)) {
#
if
defined
(
MAC_AMD_WORKAROUND
)
&&
VALUE_IS_INT2
__global
int*
d
=
(
__global
int*
)
data
;
int2
element
=
(
int2
)
(
d[2*index],
d[2*index+1]
)
;
float
key
=
(
float
)
getValue
(
element
)
;
#
else
float
key
=
(
float
)
getValue
(
data[index]
)
;
#
endif
uint
bucketIndex
=
min
((
uint
)
((
key-minValue
)
/bucketWidth
)
,
numBuckets-1
)
;
offsetInBucket[index]
=
atom_inc
(
&bucketOffset[bucketIndex]
)
;
bucketOfElement[index]
=
bucketIndex
;
...
...
platforms/opencl/tests/TestOpenCLCustomIntegrator.cpp
View file @
59bccb15
...
...
@@ -334,7 +334,7 @@ void testMonteCarlo() {
integrator
.
addComputePerDof
(
"oldx"
,
"x"
);
integrator
.
addComputePerDof
(
"x"
,
"x+dt*gaussian"
);
integrator
.
addComputeGlobal
(
"accept"
,
"step(exp((oldE-energy)/kT)-uniform)"
);
integrator
.
addComputePerDof
(
"x"
,
"
accept*x + (1-
accept
)*
oldx"
);
integrator
.
addComputePerDof
(
"x"
,
"
select(
accept
, x,
oldx
)
"
);
HarmonicBondForce
*
forceField
=
new
HarmonicBondForce
();
forceField
->
addBond
(
0
,
1
,
2.0
,
10.0
);
system
.
addForce
(
forceField
);
...
...
platforms/opencl/tests/TestOpenCLFFT.cpp
View file @
59bccb15
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011 Stanford University and the Authors.
*
* Portions copyright (c) 2011
-2015
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -51,7 +51,7 @@ using namespace std;
static
OpenCLPlatform
platform
;
template
<
class
Real2
>
void
testTransform
()
{
void
testTransform
(
bool
realToComplex
,
int
xsize
,
int
ysize
,
int
zsize
)
{
System
system
;
system
.
addParticle
(
0.0
);
OpenCLPlatform
::
PlatformData
platformData
(
system
,
""
,
""
,
platform
.
getPropertyDefaultValue
(
"OpenCLPrecision"
),
"false"
);
...
...
@@ -59,7 +59,6 @@ void testTransform() {
context
.
initialize
();
OpenMM_SFMT
::
SFMT
sfmt
;
init_gen_rand
(
0
,
sfmt
);
int
xsize
=
28
,
ysize
=
25
,
zsize
=
30
;
vector
<
Real2
>
original
(
xsize
*
ysize
*
zsize
);
vector
<
t_complex
>
reference
(
original
.
size
());
for
(
int
i
=
0
;
i
<
(
int
)
original
.
size
();
i
++
)
{
...
...
@@ -67,10 +66,16 @@ void testTransform() {
original
[
i
]
=
value
;
reference
[
i
]
=
t_complex
(
value
.
x
,
value
.
y
);
}
for
(
int
i
=
0
;
i
<
(
int
)
reference
.
size
();
i
++
)
{
if
(
realToComplex
)
reference
[
i
]
=
t_complex
(
i
%
2
==
0
?
original
[
i
/
2
].
x
:
original
[
i
/
2
].
y
,
0
);
else
reference
[
i
]
=
t_complex
(
original
[
i
].
x
,
original
[
i
].
y
);
}
OpenCLArray
grid1
(
context
,
original
.
size
(),
sizeof
(
Real2
),
"grid1"
);
OpenCLArray
grid2
(
context
,
original
.
size
(),
sizeof
(
Real2
),
"grid2"
);
grid1
.
upload
(
original
);
OpenCLFFT3D
fft
(
context
,
xsize
,
ysize
,
zsize
);
OpenCLFFT3D
fft
(
context
,
xsize
,
ysize
,
zsize
,
realToComplex
);
// Perform a forward FFT, then verify the result is correct.
...
...
@@ -80,9 +85,14 @@ void testTransform() {
fftpack_t
plan
;
fftpack_init_3d
(
&
plan
,
xsize
,
ysize
,
zsize
);
fftpack_exec_3d
(
plan
,
FFTPACK_FORWARD
,
&
reference
[
0
],
&
reference
[
0
]);
for
(
int
i
=
0
;
i
<
(
int
)
result
.
size
();
++
i
)
{
ASSERT_EQUAL_TOL
(
reference
[
i
].
re
,
result
[
i
].
x
,
1e-3
);
ASSERT_EQUAL_TOL
(
reference
[
i
].
im
,
result
[
i
].
y
,
1e-3
);
int
outputZSize
=
(
realToComplex
?
zsize
/
2
+
1
:
zsize
);
for
(
int
x
=
0
;
x
<
xsize
;
x
++
)
for
(
int
y
=
0
;
y
<
ysize
;
y
++
)
for
(
int
z
=
0
;
z
<
outputZSize
;
z
++
)
{
int
index1
=
x
*
ysize
*
zsize
+
y
*
zsize
+
z
;
int
index2
=
x
*
ysize
*
outputZSize
+
y
*
outputZSize
+
z
;
ASSERT_EQUAL_TOL
(
reference
[
index1
].
re
,
result
[
index2
].
x
,
1e-3
);
ASSERT_EQUAL_TOL
(
reference
[
index1
].
im
,
result
[
index2
].
y
,
1e-3
);
}
fftpack_destroy
(
plan
);
...
...
@@ -91,7 +101,8 @@ void testTransform() {
fft
.
execFFT
(
grid2
,
grid1
,
false
);
grid1
.
download
(
result
);
double
scale
=
1.0
/
(
xsize
*
ysize
*
zsize
);
for
(
int
i
=
0
;
i
<
(
int
)
result
.
size
();
++
i
)
{
int
valuesToCheck
=
(
realToComplex
?
original
.
size
()
/
2
:
original
.
size
());
for
(
int
i
=
0
;
i
<
valuesToCheck
;
++
i
)
{
ASSERT_EQUAL_TOL
(
original
[
i
].
x
,
scale
*
result
[
i
].
x
,
1e-4
);
ASSERT_EQUAL_TOL
(
original
[
i
].
y
,
scale
*
result
[
i
].
y
,
1e-4
);
}
...
...
@@ -101,10 +112,20 @@ int main(int argc, char* argv[]) {
try
{
if
(
argc
>
1
)
platform
.
setPropertyDefaultValue
(
"OpenCLPrecision"
,
string
(
argv
[
1
]));
if
(
platform
.
getPropertyDefaultValue
(
"OpenCLPrecision"
)
==
"double"
)
testTransform
<
mm_double2
>
();
else
testTransform
<
mm_float2
>
();
if
(
platform
.
getPropertyDefaultValue
(
"OpenCLPrecision"
)
==
"double"
)
{
testTransform
<
mm_double2
>
(
false
,
28
,
25
,
30
);
testTransform
<
mm_double2
>
(
true
,
28
,
25
,
25
);
testTransform
<
mm_double2
>
(
true
,
25
,
28
,
25
);
testTransform
<
mm_double2
>
(
true
,
25
,
25
,
28
);
testTransform
<
mm_double2
>
(
true
,
21
,
25
,
27
);
}
else
{
testTransform
<
mm_float2
>
(
false
,
28
,
25
,
30
);
testTransform
<
mm_float2
>
(
true
,
28
,
25
,
25
);
testTransform
<
mm_float2
>
(
true
,
25
,
28
,
25
);
testTransform
<
mm_float2
>
(
true
,
25
,
25
,
28
);
testTransform
<
mm_float2
>
(
true
,
21
,
25
,
27
);
}
}
catch
(
const
exception
&
e
)
{
cout
<<
"exception: "
<<
e
.
what
()
<<
endl
;
...
...
platforms/opencl/tests/TestOpenCLLocalEnergyMinimizer.cpp
View file @
59bccb15
...
...
@@ -7,7 +7,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-201
4
Stanford University and the Authors. *
* Portions copyright (c) 2010-201
5
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -81,7 +81,7 @@ void testLargeSystem() {
const
int
numParticles
=
numMolecules
*
2
;
const
double
cutoff
=
2.0
;
const
double
boxSize
=
4.0
;
const
double
tolerance
=
5
;
const
double
tolerance
=
10
;
System
system
;
system
.
setDefaultPeriodicBoxVectors
(
Vec3
(
boxSize
,
0
,
0
),
Vec3
(
0
,
boxSize
,
0
),
Vec3
(
0
,
0
,
boxSize
));
NonbondedForce
*
nonbonded
=
new
NonbondedForce
();
...
...
@@ -114,7 +114,7 @@ void testLargeSystem() {
State
finalState
=
context
.
getState
(
State
::
Forces
|
State
::
Energy
|
State
::
Positions
);
ASSERT
(
finalState
.
getPotentialEnergy
()
<
initialState
.
getPotentialEnergy
());
// Compute the force magnitude, sub
s
tracting off any component parallel to a constraint, and
// Compute the force magnitude, subtracting off any component parallel to a constraint, and
// check that it satisfies the requested tolerance.
double
forceNorm
=
0.0
;
...
...
@@ -129,8 +129,8 @@ void testLargeSystem() {
f
-=
dir
*
dir
.
dot
(
f
);
forceNorm
+=
f
.
dot
(
f
);
}
forceNorm
=
sqrt
(
forceNorm
/
(
4
*
numMolecules
));
ASSERT
(
forceNorm
<
3
*
tolerance
);
forceNorm
=
sqrt
(
forceNorm
/
(
5
*
numMolecules
));
ASSERT
(
forceNorm
<
2
*
tolerance
);
}
void
testVirtualSites
()
{
...
...
@@ -138,7 +138,7 @@ void testVirtualSites() {
const
int
numParticles
=
numMolecules
*
3
;
const
double
cutoff
=
2.0
;
const
double
boxSize
=
4.0
;
const
double
tolerance
=
5
;
const
double
tolerance
=
10
;
System
system
;
system
.
setDefaultPeriodicBoxVectors
(
Vec3
(
boxSize
,
0
,
0
),
Vec3
(
0
,
boxSize
,
0
),
Vec3
(
0
,
0
,
boxSize
));
NonbondedForce
*
nonbonded
=
new
NonbondedForce
();
...
...
@@ -195,8 +195,8 @@ void testVirtualSites() {
ASSERT_EQUAL_VEC
((
finalState
.
getPositions
()[
i
+
1
]
+
finalState
.
getPositions
()[
i
])
*
0.5
,
finalState
.
getPositions
()[
i
+
2
],
1e-5
);
}
forceNorm
=
sqrt
(
forceNorm
/
(
4
*
numMolecules
));
ASSERT
(
forceNorm
<
3
*
tolerance
);
forceNorm
=
sqrt
(
forceNorm
/
(
5
*
numMolecules
));
ASSERT
(
forceNorm
<
2
*
tolerance
);
}
int
main
(
int
argc
,
char
*
argv
[])
{
...
...
platforms/reference/src/SimTKReference/ObcParameters.cpp
View file @
59bccb15
...
...
@@ -382,9 +382,9 @@ void ObcParameters::setPeriodic(OpenMM::RealVec* vectors) {
assert
(
_cutoff
);
assert
(
boxSize
[
0
][
0
]
>=
2.0
*
_cutoffDistance
);
assert
(
boxSize
[
1
][
1
]
>=
2.0
*
_cutoffDistance
);
assert
(
boxSize
[
2
][
2
]
>=
2.0
*
_cutoffDistance
);
assert
(
vectors
[
0
][
0
]
>=
2.0
*
_cutoffDistance
);
assert
(
vectors
[
1
][
1
]
>=
2.0
*
_cutoffDistance
);
assert
(
vectors
[
2
][
2
]
>=
2.0
*
_cutoffDistance
);
_periodic
=
true
;
_periodicBoxVectors
[
0
]
=
vectors
[
0
];
...
...
platforms/reference/tests/TestReferenceCustomIntegrator.cpp
View file @
59bccb15
...
...
@@ -334,7 +334,7 @@ void testMonteCarlo() {
integrator
.
addComputePerDof
(
"oldx"
,
"x"
);
integrator
.
addComputePerDof
(
"x"
,
"x+dt*gaussian"
);
integrator
.
addComputeGlobal
(
"accept"
,
"step(exp((oldE-energy)/kT)-uniform)"
);
integrator
.
addComputePerDof
(
"x"
,
"
accept*x + (1-
accept
)*
oldx"
);
integrator
.
addComputePerDof
(
"x"
,
"
select(
accept
, x,
oldx
)
"
);
HarmonicBondForce
*
forceField
=
new
HarmonicBondForce
();
forceField
->
addBond
(
0
,
1
,
2.0
,
10.0
);
system
.
addForce
(
forceField
);
...
...
platforms/reference/tests/TestReferenceLocalEnergyMinimizer.cpp
View file @
59bccb15
...
...
@@ -7,7 +7,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-201
4
Stanford University and the Authors. *
* Portions copyright (c) 2010-201
5
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -129,8 +129,8 @@ void testLargeSystem() {
f
-=
dir
*
dir
.
dot
(
f
);
forceNorm
+=
f
.
dot
(
f
);
}
forceNorm
=
sqrt
(
forceNorm
/
(
4
*
numMolecules
));
ASSERT
(
forceNorm
<
3
*
tolerance
);
forceNorm
=
sqrt
(
forceNorm
/
(
5
*
numMolecules
));
ASSERT
(
forceNorm
<
2
*
tolerance
);
}
void
testVirtualSites
()
{
...
...
@@ -195,8 +195,8 @@ void testVirtualSites() {
ASSERT_EQUAL_VEC
((
finalState
.
getPositions
()[
i
+
1
]
+
finalState
.
getPositions
()[
i
])
*
0.5
,
finalState
.
getPositions
()[
i
+
2
],
1e-5
);
}
forceNorm
=
sqrt
(
forceNorm
/
(
4
*
numMolecules
));
ASSERT
(
forceNorm
<
3
*
tolerance
);
forceNorm
=
sqrt
(
forceNorm
/
(
5
*
numMolecules
));
ASSERT
(
forceNorm
<
2
*
tolerance
);
}
int
main
()
{
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment