Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
374676c0
Commit
374676c0
authored
Apr 20, 2010
by
Peter Eastman
Browse files
Workaround for AMD OpenCL bug: FFT can work with a smaller number of threads per work group
parent
ae36b287
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
6746 additions
and
5835 deletions
+6746
-5835
platforms/opencl/src/OpenCLFFT3D.cpp
platforms/opencl/src/OpenCLFFT3D.cpp
+10
-8
platforms/opencl/src/cl.hpp
platforms/opencl/src/cl.hpp
+6731
-5823
platforms/opencl/src/kernels/fft.cl
platforms/opencl/src/kernels/fft.cl
+5
-4
No files found.
platforms/opencl/src/OpenCLFFT3D.cpp
View file @
374676c0
...
@@ -42,15 +42,16 @@ OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize
...
@@ -42,15 +42,16 @@ OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize
}
}
void
OpenCLFFT3D
::
execFFT
(
OpenCLArray
<
mm_float2
>&
data
,
bool
forward
)
{
void
OpenCLFFT3D
::
execFFT
(
OpenCLArray
<
mm_float2
>&
data
,
bool
forward
)
{
int
maxSize
=
xkernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
());
xkernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
xkernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
xkernel
.
setArg
<
cl_float
>
(
1
,
forward
?
1.0
f
:
-
1.0
f
);
xkernel
.
setArg
<
cl_float
>
(
1
,
forward
?
1.0
f
:
-
1.0
f
);
context
.
executeKernel
(
xkernel
,
xsize
*
ysize
*
zsize
,
xsize
);
context
.
executeKernel
(
xkernel
,
xsize
*
ysize
*
zsize
,
min
(
xsize
,
(
int
)
maxSize
)
);
ykernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
ykernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
ykernel
.
setArg
<
cl_float
>
(
1
,
forward
?
1.0
f
:
-
1.0
f
);
ykernel
.
setArg
<
cl_float
>
(
1
,
forward
?
1.0
f
:
-
1.0
f
);
context
.
executeKernel
(
ykernel
,
xsize
*
ysize
*
zsize
,
ysize
);
context
.
executeKernel
(
ykernel
,
xsize
*
ysize
*
zsize
,
min
(
ysize
,
(
int
)
maxSize
)
);
zkernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
zkernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
zkernel
.
setArg
<
cl_float
>
(
1
,
forward
?
1.0
f
:
-
1.0
f
);
zkernel
.
setArg
<
cl_float
>
(
1
,
forward
?
1.0
f
:
-
1.0
f
);
context
.
executeKernel
(
zkernel
,
xsize
*
ysize
*
zsize
,
zsize
);
context
.
executeKernel
(
zkernel
,
xsize
*
ysize
*
zsize
,
min
(
zsize
,
(
int
)
maxSize
)
);
}
}
int
OpenCLFFT3D
::
findLegalDimension
(
int
minimum
)
{
int
OpenCLFFT3D
::
findLegalDimension
(
int
minimum
)
{
...
@@ -86,7 +87,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult,
...
@@ -86,7 +87,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult,
if
(
unfactored
%
5
==
0
)
{
if
(
unfactored
%
5
==
0
)
{
L
=
L
/
5
;
L
=
L
/
5
;
source
<<
"// Pass "
<<
(
stage
+
1
)
<<
" (radix 5)
\n
"
;
source
<<
"// Pass "
<<
(
stage
+
1
)
<<
" (radix 5)
\n
"
;
source
<<
"
if (i < "
<<
(
L
*
m
)
<<
"
) {
\n
"
;
source
<<
"
for (int i = get_local_id(0); i < "
<<
(
L
*
m
)
<<
"; i += get_local_size(0)
) {
\n
"
;
source
<<
"int j = i/"
<<
m
<<
";
\n
"
;
source
<<
"int j = i/"
<<
m
<<
";
\n
"
;
source
<<
"float2 c0 = data"
<<
input
<<
"[i];
\n
"
;
source
<<
"float2 c0 = data"
<<
input
<<
"[i];
\n
"
;
source
<<
"float2 c1 = data"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"float2 c1 = data"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
...
@@ -117,7 +118,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult,
...
@@ -117,7 +118,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult,
else
if
(
unfactored
%
4
==
0
)
{
else
if
(
unfactored
%
4
==
0
)
{
L
=
L
/
4
;
L
=
L
/
4
;
source
<<
"// Pass "
<<
(
stage
+
1
)
<<
" (radix 4)
\n
"
;
source
<<
"// Pass "
<<
(
stage
+
1
)
<<
" (radix 4)
\n
"
;
source
<<
"
if (i < "
<<
(
L
*
m
)
<<
"
) {
\n
"
;
source
<<
"
for (int i = get_local_id(0); i < "
<<
(
L
*
m
)
<<
"; i += get_local_size(0)
) {
\n
"
;
source
<<
"int j = i/"
<<
m
<<
";
\n
"
;
source
<<
"int j = i/"
<<
m
<<
";
\n
"
;
source
<<
"float2 c0 = data"
<<
input
<<
"[i];
\n
"
;
source
<<
"float2 c0 = data"
<<
input
<<
"[i];
\n
"
;
source
<<
"float2 c1 = data"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"float2 c1 = data"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
...
@@ -138,7 +139,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult,
...
@@ -138,7 +139,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult,
else
if
(
unfactored
%
3
==
0
)
{
else
if
(
unfactored
%
3
==
0
)
{
L
=
L
/
3
;
L
=
L
/
3
;
source
<<
"// Pass "
<<
(
stage
+
1
)
<<
" (radix 3)
\n
"
;
source
<<
"// Pass "
<<
(
stage
+
1
)
<<
" (radix 3)
\n
"
;
source
<<
"
if (i < "
<<
(
L
*
m
)
<<
"
) {
\n
"
;
source
<<
"
for (int i = get_local_id(0); i < "
<<
(
L
*
m
)
<<
"; i += get_local_size(0)
) {
\n
"
;
source
<<
"int j = i/"
<<
m
<<
";
\n
"
;
source
<<
"int j = i/"
<<
m
<<
";
\n
"
;
source
<<
"float2 c0 = data"
<<
input
<<
"[i];
\n
"
;
source
<<
"float2 c0 = data"
<<
input
<<
"[i];
\n
"
;
source
<<
"float2 c1 = data"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"float2 c1 = data"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
...
@@ -156,7 +157,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult,
...
@@ -156,7 +157,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult,
else
if
(
unfactored
%
2
==
0
)
{
else
if
(
unfactored
%
2
==
0
)
{
L
=
L
/
2
;
L
=
L
/
2
;
source
<<
"// Pass "
<<
(
stage
+
1
)
<<
" (radix 2)
\n
"
;
source
<<
"// Pass "
<<
(
stage
+
1
)
<<
" (radix 2)
\n
"
;
source
<<
"
if (i < "
<<
(
L
*
m
)
<<
"
) {
\n
"
;
source
<<
"
for (int i = get_local_id(0); i < "
<<
(
L
*
m
)
<<
"; i += get_local_size(0)
) {
\n
"
;
source
<<
"int j = i/"
<<
m
<<
";
\n
"
;
source
<<
"int j = i/"
<<
m
<<
";
\n
"
;
source
<<
"float2 c0 = data"
<<
input
<<
"[i];
\n
"
;
source
<<
"float2 c0 = data"
<<
input
<<
"[i];
\n
"
;
source
<<
"float2 c1 = data"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"float2 c1 = data"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
...
@@ -175,7 +176,8 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult,
...
@@ -175,7 +176,8 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult,
// Create the kernel.
// Create the kernel.
source
<<
"matrix[element] = data"
<<
(
stage
%
2
)
<<
"[i];
\n
"
;
source
<<
"for (int i = get_local_id(0); i < XSIZE; i += get_local_size(0))
\n
"
;
source
<<
"matrix[i*XMULT+y*YMULT+z*ZMULT] = data"
<<
(
stage
%
2
)
<<
"[i];
\n
"
;
source
<<
"barrier(CLK_GLOBAL_MEM_FENCE);"
;
source
<<
"barrier(CLK_GLOBAL_MEM_FENCE);"
;
map
<
string
,
string
>
replacements
;
map
<
string
,
string
>
replacements
;
replacements
[
"XSIZE"
]
=
OpenCLExpressionUtilities
::
intToString
(
xsize
);
replacements
[
"XSIZE"
]
=
OpenCLExpressionUtilities
::
intToString
(
xsize
);
...
...
platforms/opencl/src/cl.hpp
View file @
374676c0
This source diff could not be displayed because it is too large. You can
view the blob
instead.
platforms/opencl/src/kernels/fft.cl
View file @
374676c0
...
@@ -7,13 +7,14 @@ float2 multiplyComplex(float2 c1, float2 c2) {
...
@@ -7,13 +7,14 @@ float2 multiplyComplex(float2 c1, float2 c2) {
*/
*/
__kernel
void
execFFT
(
__global
float2*
matrix,
float
sign,
__local
float2*
w,
__local
float2*
data0,
__local
float2*
data1
)
{
__kernel
void
execFFT
(
__global
float2*
matrix,
float
sign,
__local
float2*
w,
__local
float2*
data0,
__local
float2*
data1
)
{
const
int
i
=
get_local_id
(
0
)
;
for
(
int
i
=
get_local_id
(
0
)
;
i < XSIZE; i += get_local_size(0))
w[i]
=
(
float2
)
(
cos
(
-sign*i*2*M_PI/XSIZE
)
,
sin
(
-sign*i*2*M_PI/XSIZE
))
;
w[i]
=
(
float2
)
(
cos
(
-sign*i*2*M_PI/XSIZE
)
,
sin
(
-sign*i*2*M_PI/XSIZE
))
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
index
=
get_group_id
(
0
)
; index < YSIZE*ZSIZE; index += get_num_groups(0)) {
for
(
int
index
=
get_group_id
(
0
)
; index < YSIZE*ZSIZE; index += get_num_groups(0)) {
int
z
=
index/YSIZE
;
int
z
=
index/YSIZE
;
int
y
=
index-z*YSIZE
;
int
y
=
index-z*YSIZE
;
int
element
=
i*XMULT+y*YMULT+z*ZMULT
;
for
(
int
i
=
get_local_id
(
0
)
; i < XSIZE; i += get_local_size(0))
data0[i]
=
matrix[
element
]
;
data0[i]
=
matrix[
i*XMULT+y*YMULT+z*ZMULT
]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
COMPUTE_FFT
COMPUTE_FFT
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment