Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
e531310f
Commit
e531310f
authored
Apr 06, 2015
by
peastman
Browse files
Merge pull request #869 from peastman/fft
OpenCL FFT supports real-to-complex transforms
parents
f2a9c210
99ed3ba7
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
108 additions
and
66 deletions
+108
-66
platforms/opencl/include/OpenCLFFT3D.h
platforms/opencl/include/OpenCLFFT3D.h
+10
-6
platforms/opencl/src/OpenCLFFT3D.cpp
platforms/opencl/src/OpenCLFFT3D.cpp
+39
-29
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+15
-5
platforms/opencl/src/kernels/fft.cl
platforms/opencl/src/kernels/fft.cl
+10
-2
platforms/opencl/src/kernels/pme.cl
platforms/opencl/src/kernels/pme.cl
+15
-16
platforms/opencl/tests/TestOpenCLFFT.cpp
platforms/opencl/tests/TestOpenCLFFT.cpp
+19
-8
No files found.
platforms/opencl/include/OpenCLFFT3D.h
View file @
e531310f
...
...
@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors.
*
* Portions copyright (c) 2009
-2015
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -40,7 +40,7 @@ namespace OpenMM {
* 15, 207–228 (2000).
* <p>
* This class places certain restrictions on the allowed dimensions of the grid. First,
* the size of each dimension may have no prime factors other than 2, 3, and
5
. You
* the size of each dimension may have no prime factors other than 2, 3,
5,
and
7
. You
* can call findLegalDimension() to determine the smallest size that satisfies this
* requirement and is greater than or equal to a specified minimum size. Second, the size
* of each dimension must be small enough to compute each 1D transform entirely in local
...
...
@@ -61,12 +61,14 @@ public:
* @param xsize the first dimension of the data sets on which FFTs will be performed
* @param ysize the second dimension of the data sets on which FFTs will be performed
* @param zsize the third dimension of the data sets on which FFTs will be performed
* @param realToComplex if true, a real-to-complex transform will be done. Otherwise, it is complex-to-complex.
*/
OpenCLFFT3D
(
OpenCLContext
&
context
,
int
xsize
,
int
ysize
,
int
zsize
);
OpenCLFFT3D
(
OpenCLContext
&
context
,
int
xsize
,
int
ysize
,
int
zsize
,
bool
realToComplex
=
false
);
/**
* Perform a Fourier transform. The transform cannot be done in-place: the input and output
* arrays must be different. Also, the input array is used as workspace, so its contents
* are destroyed.
* are destroyed. This also means that both arrays must be large enough to hold complex values,
* even when performing a real-to-complex transform.
*
* @param in the data to transform, ordered such that in[x*ysize*zsize + y*zsize + z] contains element (x, y, z)
* @param out on exit, this contains the transformed data
...
...
@@ -75,17 +77,19 @@ public:
void
execFFT
(
OpenCLArray
&
in
,
OpenCLArray
&
out
,
bool
forward
=
true
);
/**
* Get the smallest legal size for a dimension of the grid (that is, a size with no prime
* factors other than 2, 3, and
5
).
* factors other than 2, 3,
5,
and
7
).
*
* @param minimum the minimum size the return value must be greater than or equal to
*/
static
int
findLegalDimension
(
int
minimum
);
private:
cl
::
Kernel
createKernel
(
int
xsize
,
int
ysize
,
int
zsize
,
int
&
threads
);
cl
::
Kernel
createKernel
(
int
xsize
,
int
ysize
,
int
zsize
,
int
&
threads
,
int
axis
,
bool
forward
);
int
xsize
,
ysize
,
zsize
;
int
xthreads
,
ythreads
,
zthreads
;
bool
realToComplex
;
OpenCLContext
&
context
;
cl
::
Kernel
xkernel
,
ykernel
,
zkernel
;
cl
::
Kernel
invxkernel
,
invykernel
,
invzkernel
;
};
}
// namespace OpenMM
...
...
platforms/opencl/src/OpenCLFFT3D.cpp
View file @
e531310f
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-201
2
Stanford University and the Authors. *
* Portions copyright (c) 2009-201
5
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -35,25 +35,29 @@
using
namespace
OpenMM
;
using
namespace
std
;
OpenCLFFT3D
::
OpenCLFFT3D
(
OpenCLContext
&
context
,
int
xsize
,
int
ysize
,
int
zsize
)
:
context
(
context
),
xsize
(
xsize
),
ysize
(
ysize
),
zsize
(
zsize
)
{
zkernel
=
createKernel
(
xsize
,
ysize
,
zsize
,
zthreads
);
xkernel
=
createKernel
(
ysize
,
zsize
,
xsize
,
xthreads
);
ykernel
=
createKernel
(
zsize
,
xsize
,
ysize
,
ythreads
);
OpenCLFFT3D
::
OpenCLFFT3D
(
OpenCLContext
&
context
,
int
xsize
,
int
ysize
,
int
zsize
,
bool
realToComplex
)
:
context
(
context
),
xsize
(
xsize
),
ysize
(
ysize
),
zsize
(
zsize
),
realToComplex
(
realToComplex
)
{
zkernel
=
createKernel
(
xsize
,
ysize
,
zsize
,
zthreads
,
0
,
true
);
xkernel
=
createKernel
(
ysize
,
zsize
,
xsize
,
xthreads
,
1
,
true
);
ykernel
=
createKernel
(
zsize
,
xsize
,
ysize
,
ythreads
,
2
,
true
);
invzkernel
=
createKernel
(
xsize
,
ysize
,
zsize
,
zthreads
,
0
,
false
);
invxkernel
=
createKernel
(
ysize
,
zsize
,
xsize
,
xthreads
,
1
,
false
);
invykernel
=
createKernel
(
zsize
,
xsize
,
ysize
,
ythreads
,
2
,
false
);
}
void
OpenCLFFT3D
::
execFFT
(
OpenCLArray
&
in
,
OpenCLArray
&
out
,
bool
forward
)
{
zkernel
.
setArg
<
cl
::
Buffer
>
(
0
,
in
.
getDeviceBuffer
()
);
zkernel
.
setArg
<
cl
::
Buffer
>
(
1
,
out
.
getDeviceBuffer
()
);
zkernel
.
setArg
<
cl_int
>
(
2
,
forward
?
1
:
-
1
);
context
.
executeKernel
(
zkernel
,
xsize
*
ysize
*
zsize
,
zthreads
);
x
kernel
.
setArg
<
cl
::
Buffer
>
(
0
,
out
.
getDeviceBuffer
());
xkernel
.
setArg
<
cl
::
Buffer
>
(
1
,
in
.
getDeviceBuffer
()
);
x
kernel
.
setArg
<
cl
_int
>
(
2
,
forward
?
1
:
-
1
);
context
.
executeKernel
(
xkernel
,
xsize
*
ysize
*
zsize
,
xthreads
);
ykernel
.
setArg
<
cl
::
Buffer
>
(
0
,
in
.
getDeviceBuffer
()
);
y
kernel
.
setArg
<
cl
::
Buffer
>
(
1
,
out
.
getDeviceBuffer
());
y
kernel
.
setArg
<
cl
_int
>
(
2
,
forward
?
1
:
-
1
);
context
.
executeKernel
(
y
kernel
,
xsize
*
ysize
*
zsize
,
ythreads
);
cl
::
Kernel
kernel1
=
(
forward
?
zkernel
:
invzkernel
);
cl
::
Kernel
kernel2
=
(
forward
?
xkernel
:
invxkernel
);
cl
::
Kernel
kernel3
=
(
forward
?
ykernel
:
invykernel
);
kernel1
.
setArg
<
cl
::
Buffer
>
(
0
,
in
.
getDeviceBuffer
()
);
kernel
1
.
setArg
<
cl
::
Buffer
>
(
1
,
out
.
getDeviceBuffer
());
context
.
executeKernel
(
kernel1
,
xsize
*
ysize
*
zsize
,
zthreads
);
kernel
2
.
setArg
<
cl
::
Buffer
>
(
0
,
out
.
getDeviceBuffer
()
);
kernel2
.
setArg
<
cl
::
Buffer
>
(
1
,
in
.
getDeviceBuffer
()
);
context
.
executeKernel
(
kernel2
,
xsize
*
ysize
*
zsize
,
xthreads
);
kernel
3
.
setArg
<
cl
::
Buffer
>
(
0
,
in
.
getDeviceBuffer
());
kernel
3
.
setArg
<
cl
::
Buffer
>
(
1
,
out
.
getDeviceBuffer
()
);
context
.
executeKernel
(
kernel
3
,
xsize
*
ysize
*
zsize
,
ythreads
);
}
int
OpenCLFFT3D
::
findLegalDimension
(
int
minimum
)
{
...
...
@@ -73,7 +77,7 @@ int OpenCLFFT3D::findLegalDimension(int minimum) {
}
}
cl
::
Kernel
OpenCLFFT3D
::
createKernel
(
int
xsize
,
int
ysize
,
int
zsize
,
int
&
threads
)
{
cl
::
Kernel
OpenCLFFT3D
::
createKernel
(
int
xsize
,
int
ysize
,
int
zsize
,
int
&
threads
,
int
axis
,
bool
forward
)
{
int
maxThreads
=
std
::
min
(
256
,
(
int
)
context
.
getDevice
().
getInfo
<
CL_DEVICE_MAX_WORK_GROUP_SIZE
>
());
bool
isCPU
=
context
.
getDevice
().
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_CPU
;
while
(
true
)
{
...
...
@@ -137,10 +141,10 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source
<<
"real2 b2 = "
<<
context
.
doubleToString
((
2
*
cos
(
2
*
M_PI
/
7
)
-
cos
(
4
*
M_PI
/
7
)
-
cos
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d0-d4);
\n
"
;
source
<<
"real2 b3 = "
<<
context
.
doubleToString
((
cos
(
2
*
M_PI
/
7
)
-
2
*
cos
(
4
*
M_PI
/
7
)
+
cos
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d4-d2);
\n
"
;
source
<<
"real2 b4 = "
<<
context
.
doubleToString
((
cos
(
2
*
M_PI
/
7
)
+
cos
(
4
*
M_PI
/
7
)
-
2
*
cos
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d2-d0);
\n
"
;
source
<<
"real2 b5 = -
sign
*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
+
sin
(
4
*
M_PI
/
7
)
-
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d7+d1);
\n
"
;
source
<<
"real2 b6 = -
sign
*"
<<
context
.
doubleToString
((
2
*
sin
(
2
*
M_PI
/
7
)
-
sin
(
4
*
M_PI
/
7
)
+
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d1-d5);
\n
"
;
source
<<
"real2 b7 = -
sign
*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
-
2
*
sin
(
4
*
M_PI
/
7
)
-
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d5-d3);
\n
"
;
source
<<
"real2 b8 = -
sign
*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
+
sin
(
4
*
M_PI
/
7
)
+
2
*
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d3-d1);
\n
"
;
source
<<
"real2 b5 = -
(SIGN)
*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
+
sin
(
4
*
M_PI
/
7
)
-
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d7+d1);
\n
"
;
source
<<
"real2 b6 = -
(SIGN)
*"
<<
context
.
doubleToString
((
2
*
sin
(
2
*
M_PI
/
7
)
-
sin
(
4
*
M_PI
/
7
)
+
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d1-d5);
\n
"
;
source
<<
"real2 b7 = -
(SIGN)
*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
-
2
*
sin
(
4
*
M_PI
/
7
)
-
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d5-d3);
\n
"
;
source
<<
"real2 b8 = -
(SIGN)
*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
+
sin
(
4
*
M_PI
/
7
)
+
2
*
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d3-d1);
\n
"
;
source
<<
"real2 t0 = b0+b1;
\n
"
;
source
<<
"real2 t1 = b2+b3;
\n
"
;
source
<<
"real2 t2 = b4-b3;
\n
"
;
...
...
@@ -178,8 +182,8 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source
<<
"real2 d7 = d6+d5;
\n
"
;
source
<<
"real2 d8 = d6-d5;
\n
"
;
string
coeff
=
context
.
doubleToString
(
sin
(
0.2
*
M_PI
)
/
sin
(
0.4
*
M_PI
));
source
<<
"real2 d9 =
sign
*(real2) (d2.y+"
<<
coeff
<<
"*d3.y, -d2.x-"
<<
coeff
<<
"*d3.x);
\n
"
;
source
<<
"real2 d10 =
sign
*(real2) ("
<<
coeff
<<
"*d2.y-d3.y, d3.x-"
<<
coeff
<<
"*d2.x);
\n
"
;
source
<<
"real2 d9 =
(SIGN)
*(real2) (d2.y+"
<<
coeff
<<
"*d3.y, -d2.x-"
<<
coeff
<<
"*d3.x);
\n
"
;
source
<<
"real2 d10 =
(SIGN)
*(real2) ("
<<
coeff
<<
"*d2.y-d3.y, d3.x-"
<<
coeff
<<
"*d2.x);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+4*j*"
<<
m
<<
"] = c0+d4;
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(4*j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
5
*
L
)
<<
"], d7+d9);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(4*j+2)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
2
*
zsize
)
<<
"/"
<<
(
5
*
L
)
<<
"], d8+d10);
\n
"
;
...
...
@@ -194,7 +198,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source
<<
"real2 d0 = c0+c2;
\n
"
;
source
<<
"real2 d1 = c0-c2;
\n
"
;
source
<<
"real2 d2 = c1+c3;
\n
"
;
source
<<
"real2 d3 =
sign
*(real2) (c1.y-c3.y, c3.x-c1.x);
\n
"
;
source
<<
"real2 d3 =
(SIGN)
*(real2) (c1.y-c3.y, c3.x-c1.x);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+3*j*"
<<
m
<<
"] = d0+d2;
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(3*j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
4
*
L
)
<<
"], d1+d3);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(3*j+2)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
2
*
zsize
)
<<
"/"
<<
(
4
*
L
)
<<
"], d0-d2);
\n
"
;
...
...
@@ -206,7 +210,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source
<<
"real2 c2 = data"
<<
input
<<
"[base+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 d0 = c1+c2;
\n
"
;
source
<<
"real2 d1 = c0-0.5f*d0;
\n
"
;
source
<<
"real2 d2 =
sign
*"
<<
context
.
doubleToString
(
sin
(
M_PI
/
3.0
))
<<
"*(real2) (c1.y-c2.y, c2.x-c1.x);
\n
"
;
source
<<
"real2 d2 =
(SIGN)
*"
<<
context
.
doubleToString
(
sin
(
M_PI
/
3.0
))
<<
"*(real2) (c1.y-c2.y, c2.x-c1.x);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+2*j*"
<<
m
<<
"] = c0+d0;
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(2*j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
3
*
L
)
<<
"], d1+d2);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(2*j+2)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
2
*
zsize
)
<<
"/"
<<
(
3
*
L
)
<<
"], d1-d2);
\n
"
;
...
...
@@ -226,13 +230,15 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
// Create the kernel.
bool
outputIsReal
=
(
realToComplex
&&
axis
==
2
&&
!
forward
);
string
outputSuffix
=
(
outputIsReal
?
".x"
:
""
);
if
(
loopRequired
)
{
source
<<
"for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))
\n
"
;
source
<<
"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"
<<
(
stage
%
2
)
<<
"[z];
\n
"
;
source
<<
"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"
<<
(
stage
%
2
)
<<
"[z]
"
<<
outputSuffix
<<
"
;
\n
"
;
}
else
{
source
<<
"if (index < XSIZE*YSIZE)
\n
"
;
source
<<
"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"
<<
(
stage
%
2
)
<<
"[get_local_id(0)];
\n
"
;
source
<<
"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"
<<
(
stage
%
2
)
<<
"[get_local_id(0)]
"
<<
outputSuffix
<<
"
;
\n
"
;
}
map
<
string
,
string
>
replacements
;
replacements
[
"XSIZE"
]
=
context
.
intToString
(
xsize
);
...
...
@@ -242,6 +248,10 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
replacements
[
"M_PI"
]
=
context
.
doubleToString
(
M_PI
);
replacements
[
"COMPUTE_FFT"
]
=
source
.
str
();
replacements
[
"LOOP_REQUIRED"
]
=
(
loopRequired
?
"1"
:
"0"
);
replacements
[
"SIGN"
]
=
(
forward
?
"1"
:
"-1"
);
replacements
[
"INPUT_TYPE"
]
=
(
realToComplex
&&
axis
==
0
&&
forward
?
"real"
:
"real2"
);
replacements
[
"OUTPUT_TYPE"
]
=
(
outputIsReal
?
"real"
:
"real2"
);
replacements
[
"INPUT_IS_REAL"
]
=
(
realToComplex
&&
axis
==
0
&&
forward
?
"1"
:
"0"
);
cl
::
Program
program
=
context
.
createProgram
(
context
.
replaceStrings
(
OpenCLKernelSources
::
fft
,
replacements
));
cl
::
Kernel
kernel
(
program
,
"execFFT"
);
threads
=
(
isCPU
?
1
:
blocksPerGroup
*
zsize
);
...
...
@@ -253,9 +263,9 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
continue
;
}
int
bufferSize
=
blocksPerGroup
*
zsize
*
(
context
.
getUseDoublePrecision
()
?
sizeof
(
mm_double2
)
:
sizeof
(
mm_float2
));
kernel
.
setArg
(
2
,
bufferSize
,
NULL
);
kernel
.
setArg
(
3
,
bufferSize
,
NULL
);
kernel
.
setArg
(
4
,
bufferSize
,
NULL
);
kernel
.
setArg
(
5
,
bufferSize
,
NULL
);
return
kernel
;
}
}
platforms/opencl/src/OpenCLKernels.cpp
View file @
e531310f
...
...
@@ -1652,8 +1652,11 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
int elementSize = (cl.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
pmeGrid = new OpenCLArray(cl, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid");
cl
.
addAutoclearBuffer
(
*
pmeGrid
);
pmeGrid2 = new OpenCLArray(cl, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid2");
if (cl.getSupports64BitGlobalAtomics())
cl.addAutoclearBuffer(*pmeGrid2);
else
cl.addAutoclearBuffer(*pmeGrid);
pmeBsplineModuliX = new OpenCLArray(cl, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY = new OpenCLArray(cl, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ = new OpenCLArray(cl, gridSizeZ, elementSize, "pmeBsplineModuliZ");
...
...
@@ -1661,9 +1664,12 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
pmeAtomRange = OpenCLArray::create<cl_int>(cl, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
pmeAtomGridIndex = OpenCLArray::create<mm_int2>(cl, numParticles, "pmeAtomGridIndex");
sort = new OpenCLSort(cl, new SortTrait(), cl.getNumAtoms());
fft
=
new
OpenCLFFT3D
(
cl
,
gridSizeX
,
gridSizeY
,
gridSizeZ
);
fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ
, true
);
string vendor = cl.getDevice().getInfo<CL_DEVICE_VENDOR>();
usePmeQueue
=
(
vendor
.
size
()
>=
6
&&
vendor
.
substr
(
0
,
6
)
==
"NVIDIA"
);
bool isNvidia = (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA");
if (isNvidia)
pmeDefines["USE_ALTERNATE_MEMORY_ACCESS_PATTERN"] = "1";
usePmeQueue = isNvidia;
if (usePmeQueue) {
pmeQueue = cl::CommandQueue(cl.getContext(), cl.getDevice());
int recipForceGroup = force.getReciprocalSpaceForceGroup();
...
...
@@ -1814,6 +1820,9 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeSpreadChargeKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(1, pmeAtomGridIndex->getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(2, pmeAtomRange->getDeviceBuffer());
if (cl.getSupports64BitGlobalAtomics())
pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid2->getDeviceBuffer());
else
pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid->getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(0, pmeGrid2->getDeviceBuffer());
...
...
@@ -1827,7 +1836,8 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeInterpolateForceKernel.setArg<cl::Buffer>(7, pmeAtomGridIndex->getDeviceBuffer());
if (cl.getSupports64BitGlobalAtomics()) {
pmeFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge");
pmeFinishSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
pmeGrid
->
getDeviceBuffer
());
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid2->getDeviceBuffer());
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid->getDeviceBuffer());
}
}
}
...
...
platforms/opencl/src/kernels/fft.cl
View file @
e531310f
...
...
@@ -6,10 +6,10 @@ real2 multiplyComplex(real2 c1, real2 c2) {
*
Perform
a
1D
FFT
on
each
row
along
one
axis.
*/
__kernel
void
execFFT
(
__global
const
real2
*
restrict
in,
__global
real2
*
restrict
out,
int
sign,
__local
real2*
restrict
w,
__kernel
void
execFFT
(
__global
const
INPUT_TYPE
*
restrict
in,
__global
OUTPUT_TYPE
*
restrict
out,
__local
real2*
restrict
w,
__local
real2*
restrict
data0,
__local
real2*
restrict
data1
)
{
for
(
int
i
=
get_local_id
(
0
)
; i < ZSIZE; i += get_local_size(0))
w[i]
=
(
real2
)
(
cos
(
-
sign
*i*2*M_PI/ZSIZE
)
,
sin
(
-
sign
*i*2*M_PI/ZSIZE
))
;
w[i]
=
(
real2
)
(
cos
(
-
(
SIGN
)
*i*2*
M_PI/ZSIZE
)
,
sin
(
-
(
SIGN
)
*i*2*
M_PI/ZSIZE
))
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
baseIndex
=
get_group_id
(
0
)
*BLOCKS_PER_GROUP
; baseIndex < XSIZE*YSIZE; baseIndex += get_num_groups(0)*BLOCKS_PER_GROUP) {
...
...
@@ -18,10 +18,18 @@ __kernel void execFFT(__global const real2* restrict in, __global real2* restric
int
y
=
index-x*YSIZE
;
#
if
LOOP_REQUIRED
for
(
int
z
=
get_local_id
(
0
)
; z < ZSIZE; z += get_local_size(0))
#
if
INPUT_IS_REAL
data0[z]
=
(
real2
)
(
in[x*
(
YSIZE*ZSIZE
)
+y*ZSIZE+z],
0
)
;
#
else
data0[z]
=
in[x*
(
YSIZE*ZSIZE
)
+y*ZSIZE+z]
;
#
endif
#
else
if
(
index
<
XSIZE*YSIZE
)
#
if
INPUT_IS_REAL
data0[get_local_id
(
0
)
]
=
(
real2
)
(
in[x*
(
YSIZE*ZSIZE
)
+y*ZSIZE+get_local_id
(
0
)
%ZSIZE],
0
)
;
#
else
data0[get_local_id
(
0
)
]
=
in[x*
(
YSIZE*ZSIZE
)
+y*ZSIZE+get_local_id
(
0
)
%ZSIZE]
;
#
endif
#
endif
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
COMPUTE_FFT
...
...
platforms/opencl/src/kernels/pme.cl
View file @
e531310f
...
...
@@ -138,35 +138,34 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
)
;
int
index
=
xindex*GRID_SIZE_Y*GRID_SIZE_Z
+
yindex*GRID_SIZE_Z
+
zindex
;
real
add
=
pos.w*data[ix].x*data[iy].y*data[iz].z
;
#
ifdef
USE_DOUBLE_PRECISION
atom_add
(
&pmeGrid[2*index],
(
long
)
(
add*0x100000000
))
;
#
ifdef
USE_ALTERNATE_MEMORY_ACCESS_PATTERN
//
On
Nvidia
devices
(
at
least
Maxwell
anyway
)
,
this
split
ordering
produces
much
higher
performance.
Why?
//
I
have
no
idea!
And
of
course
on
AMD
it
produces
slower
performance.
GPUs
are
not
meant
to
be
understood.
atom_add
(
&pmeGrid[index%2
==
0
?
index/2
:
(
index+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z
)
/2],
(
long
)
(
add*0x100000000
))
;
#
else
atom_add
(
&pmeGrid[index],
(
long
)
(
add*0x100000000
))
;
#
endif
}
}
}
}
}
__kernel
void
finishSpreadCharge
(
__global
long*
restrict
pmeGrid
)
{
__global
real2*
realGrid
=
(
__global
real2*
)
pmeGrid
;
__kernel
void
finishSpreadCharge
(
__global
long*
restrict
fixedGrid,
__global
real*
restrict
realGrid
)
{
const
unsigned
int
gridSize
=
GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z
;
real
scale
=
EPSILON_FACTOR/
(
real
)
0x100000000
;
for
(
int
index
=
get_global_id
(
0
)
; index < gridSize; index += get_global_size(0)) {
#
ifdef
USE_
DOUBLE_PRECISIO
N
long
value
=
pmeGrid[2*index
]
;
#
ifdef
USE_
ALTERNATE_MEMORY_ACCESS_PATTER
N
long
value
=
fixedGrid[index%2
==
0
?
index/2
:
(
index+gridSize
)
/2
]
;
#
else
long
value
=
pme
Grid[index]
;
long
value
=
fixed
Grid[index]
;
#
endif
real2
realValue
=
(
real2
)
((
real
)
(
value*scale
)
,
0
)
;
realGrid[index]
=
realValue
;
realGrid[index]
=
(
real
)
(
value*scale
)
;
}
}
#
elif
defined
(
DEVICE_IS_CPU
)
__kernel
void
gridSpreadCharge
(
__global
const
real4*
restrict
posq,
__global
const
int2*
restrict
pmeAtomGridIndex,
__global
const
int*
restrict
pmeAtomRange,
__global
real
2
*
restrict
pmeGrid,
__global
const
real4*
restrict
pmeBsplineTheta,
real4
periodicBoxSize,
real4
recipBoxVecX,
real4
recipBoxVecY,
real4
recipBoxVecZ
)
{
__global
real*
restrict
pmeGrid,
__global
const
real4*
restrict
pmeBsplineTheta,
real4
periodicBoxSize,
real4
recipBoxVecX,
real4
recipBoxVecY,
real4
recipBoxVecZ
)
{
const
int
firstx
=
get_global_id
(
0
)
*GRID_SIZE_X/get_global_size
(
0
)
;
const
int
lastx
=
(
get_global_id
(
0
)
+1
)
*GRID_SIZE_X/get_global_size
(
0
)
;
if
(
firstx
==
lastx
)
...
...
@@ -230,7 +229,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
int
zindex
=
gridIndex.z+iz
;
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
)
;
int
index
=
xindex*GRID_SIZE_Y*GRID_SIZE_Z
+
yindex*GRID_SIZE_Z
+
zindex
;
pmeGrid[index]
.x
+=
EPSILON_FACTOR*pos.w*data[ix].x*data[iy].y*data[iz].z
;
pmeGrid[index]
+=
EPSILON_FACTOR*pos.w*data[ix].x*data[iy].y*data[iz].z
;
}
}
}
...
...
@@ -238,7 +237,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
}
#
else
__kernel
void
gridSpreadCharge
(
__global
const
real4*
restrict
posq,
__global
const
int2*
restrict
pmeAtomGridIndex,
__global
const
int*
restrict
pmeAtomRange,
__global
real
2
*
restrict
pmeGrid,
__global
const
real4*
restrict
pmeBsplineTheta
)
{
__global
real*
restrict
pmeGrid,
__global
const
real4*
restrict
pmeBsplineTheta
)
{
unsigned
int
numGridPoints
=
GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z
;
for
(
int
gridIndex
=
get_global_id
(
0
)
; gridIndex < numGridPoints; gridIndex += get_global_size(0)) {
//
Compute
the
charge
on
a
grid
point.
...
...
@@ -290,7 +289,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
}
}
}
pmeGrid[gridIndex]
=
(
real2
)
(
result*EPSILON_FACTOR
,
0
)
;
pmeGrid[gridIndex]
=
result*EPSILON_FACTOR
;
}
}
#
endif
...
...
@@ -325,7 +324,7 @@ __kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global r
energyBuffer[get_global_id
(
0
)
]
+=
0.5f*energy
;
}
__kernel
void
gridInterpolateForce
(
__global
const
real4*
restrict
posq,
__global
real4*
restrict
forceBuffers,
__global
const
real
2
*
restrict
pmeGrid,
__kernel
void
gridInterpolateForce
(
__global
const
real4*
restrict
posq,
__global
real4*
restrict
forceBuffers,
__global
const
real*
restrict
pmeGrid,
real4
periodicBoxSize,
real4
recipBoxVecX,
real4
recipBoxVecY,
real4
recipBoxVecZ,
__global
int2*
restrict
pmeAtomGridIndex
)
{
const
real4
scale
=
1/
(
real
)
(
PME_ORDER-1
)
;
real4
data[PME_ORDER]
;
...
...
@@ -385,7 +384,7 @@ __kernel void gridInterpolateForce(__global const real4* restrict posq, __global
int
zindex
=
gridIndex.z+iz
;
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
)
;
int
index
=
xindex*GRID_SIZE_Y*GRID_SIZE_Z
+
yindex*GRID_SIZE_Z
+
zindex
;
real
gridvalue
=
pmeGrid[index]
.x
;
real
gridvalue
=
pmeGrid[index]
;
force.x
+=
ddata[ix].x*data[iy].y*data[iz].z*gridvalue
;
force.y
+=
data[ix].x*ddata[iy].y*data[iz].z*gridvalue
;
force.z
+=
data[ix].x*data[iy].y*ddata[iz].z*gridvalue
;
...
...
platforms/opencl/tests/TestOpenCLFFT.cpp
View file @
e531310f
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011 Stanford University and the Authors.
*
* Portions copyright (c) 2011
-2015
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -51,7 +51,7 @@ using namespace std;
static
OpenCLPlatform
platform
;
template
<
class
Real2
>
void
testTransform
()
{
void
testTransform
(
bool
realToComplex
)
{
System
system
;
system
.
addParticle
(
0.0
);
OpenCLPlatform
::
PlatformData
platformData
(
system
,
""
,
""
,
platform
.
getPropertyDefaultValue
(
"OpenCLPrecision"
),
"false"
);
...
...
@@ -67,10 +67,16 @@ void testTransform() {
original
[
i
]
=
value
;
reference
[
i
]
=
t_complex
(
value
.
x
,
value
.
y
);
}
for
(
int
i
=
0
;
i
<
(
int
)
reference
.
size
();
i
++
)
{
if
(
realToComplex
)
reference
[
i
]
=
t_complex
(
i
%
2
==
0
?
original
[
i
/
2
].
x
:
original
[
i
/
2
].
y
,
0
);
else
reference
[
i
]
=
t_complex
(
original
[
i
].
x
,
original
[
i
].
y
);
}
OpenCLArray
grid1
(
context
,
original
.
size
(),
sizeof
(
Real2
),
"grid1"
);
OpenCLArray
grid2
(
context
,
original
.
size
(),
sizeof
(
Real2
),
"grid2"
);
grid1
.
upload
(
original
);
OpenCLFFT3D
fft
(
context
,
xsize
,
ysize
,
zsize
);
OpenCLFFT3D
fft
(
context
,
xsize
,
ysize
,
zsize
,
realToComplex
);
// Perform a forward FFT, then verify the result is correct.
...
...
@@ -91,7 +97,8 @@ void testTransform() {
fft
.
execFFT
(
grid2
,
grid1
,
false
);
grid1
.
download
(
result
);
double
scale
=
1.0
/
(
xsize
*
ysize
*
zsize
);
for
(
int
i
=
0
;
i
<
(
int
)
result
.
size
();
++
i
)
{
int
valuesToCheck
=
(
realToComplex
?
original
.
size
()
/
2
:
original
.
size
());
for
(
int
i
=
0
;
i
<
valuesToCheck
;
++
i
)
{
ASSERT_EQUAL_TOL
(
original
[
i
].
x
,
scale
*
result
[
i
].
x
,
1e-4
);
ASSERT_EQUAL_TOL
(
original
[
i
].
y
,
scale
*
result
[
i
].
y
,
1e-4
);
}
...
...
@@ -101,10 +108,14 @@ int main(int argc, char* argv[]) {
try
{
if
(
argc
>
1
)
platform
.
setPropertyDefaultValue
(
"OpenCLPrecision"
,
string
(
argv
[
1
]));
if
(
platform
.
getPropertyDefaultValue
(
"OpenCLPrecision"
)
==
"double"
)
testTransform
<
mm_double2
>
();
else
testTransform
<
mm_float2
>
();
if
(
platform
.
getPropertyDefaultValue
(
"OpenCLPrecision"
)
==
"double"
)
{
testTransform
<
mm_double2
>
(
false
);
testTransform
<
mm_double2
>
(
true
);
}
else
{
testTransform
<
mm_float2
>
(
false
);
testTransform
<
mm_float2
>
(
true
);
}
}
catch
(
const
exception
&
e
)
{
cout
<<
"exception: "
<<
e
.
what
()
<<
endl
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment