Commit 374676c0 authored by Peter Eastman's avatar Peter Eastman
Browse files

Workaround for AMD OpenCL bug: FFT can work with a smaller number of threads per work group

parent ae36b287
...@@ -42,15 +42,16 @@ OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize ...@@ -42,15 +42,16 @@ OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize
} }
void OpenCLFFT3D::execFFT(OpenCLArray<mm_float2>& data, bool forward) { void OpenCLFFT3D::execFFT(OpenCLArray<mm_float2>& data, bool forward) {
int maxSize = xkernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice());
xkernel.setArg<cl::Buffer>(0, data.getDeviceBuffer()); xkernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
xkernel.setArg<cl_float>(1, forward ? 1.0f : -1.0f); xkernel.setArg<cl_float>(1, forward ? 1.0f : -1.0f);
context.executeKernel(xkernel, xsize*ysize*zsize, xsize); context.executeKernel(xkernel, xsize*ysize*zsize, min(xsize, (int) maxSize));
ykernel.setArg<cl::Buffer>(0, data.getDeviceBuffer()); ykernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
ykernel.setArg<cl_float>(1, forward ? 1.0f : -1.0f); ykernel.setArg<cl_float>(1, forward ? 1.0f : -1.0f);
context.executeKernel(ykernel, xsize*ysize*zsize, ysize); context.executeKernel(ykernel, xsize*ysize*zsize, min(ysize, (int) maxSize));
zkernel.setArg<cl::Buffer>(0, data.getDeviceBuffer()); zkernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
zkernel.setArg<cl_float>(1, forward ? 1.0f : -1.0f); zkernel.setArg<cl_float>(1, forward ? 1.0f : -1.0f);
context.executeKernel(zkernel, xsize*ysize*zsize, zsize); context.executeKernel(zkernel, xsize*ysize*zsize, min(zsize, (int) maxSize));
} }
int OpenCLFFT3D::findLegalDimension(int minimum) { int OpenCLFFT3D::findLegalDimension(int minimum) {
...@@ -86,7 +87,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult, ...@@ -86,7 +87,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult,
if (unfactored%5 == 0) { if (unfactored%5 == 0) {
L = L/5; L = L/5;
source<<"// Pass "<<(stage+1)<<" (radix 5)\n"; source<<"// Pass "<<(stage+1)<<" (radix 5)\n";
source<<"if (i < "<<(L*m)<<") {\n"; source<<"for (int i = get_local_id(0); i < "<<(L*m)<<"; i += get_local_size(0)) {\n";
source<<"int j = i/"<<m<<";\n"; source<<"int j = i/"<<m<<";\n";
source<<"float2 c0 = data"<<input<<"[i];\n"; source<<"float2 c0 = data"<<input<<"[i];\n";
source<<"float2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n"; source<<"float2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
...@@ -117,7 +118,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult, ...@@ -117,7 +118,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult,
else if (unfactored%4 == 0) { else if (unfactored%4 == 0) {
L = L/4; L = L/4;
source<<"// Pass "<<(stage+1)<<" (radix 4)\n"; source<<"// Pass "<<(stage+1)<<" (radix 4)\n";
source<<"if (i < "<<(L*m)<<") {\n"; source<<"for (int i = get_local_id(0); i < "<<(L*m)<<"; i += get_local_size(0)) {\n";
source<<"int j = i/"<<m<<";\n"; source<<"int j = i/"<<m<<";\n";
source<<"float2 c0 = data"<<input<<"[i];\n"; source<<"float2 c0 = data"<<input<<"[i];\n";
source<<"float2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n"; source<<"float2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
...@@ -138,7 +139,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult, ...@@ -138,7 +139,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult,
else if (unfactored%3 == 0) { else if (unfactored%3 == 0) {
L = L/3; L = L/3;
source<<"// Pass "<<(stage+1)<<" (radix 3)\n"; source<<"// Pass "<<(stage+1)<<" (radix 3)\n";
source<<"if (i < "<<(L*m)<<") {\n"; source<<"for (int i = get_local_id(0); i < "<<(L*m)<<"; i += get_local_size(0)) {\n";
source<<"int j = i/"<<m<<";\n"; source<<"int j = i/"<<m<<";\n";
source<<"float2 c0 = data"<<input<<"[i];\n"; source<<"float2 c0 = data"<<input<<"[i];\n";
source<<"float2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n"; source<<"float2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
...@@ -156,7 +157,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult, ...@@ -156,7 +157,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult,
else if (unfactored%2 == 0) { else if (unfactored%2 == 0) {
L = L/2; L = L/2;
source<<"// Pass "<<(stage+1)<<" (radix 2)\n"; source<<"// Pass "<<(stage+1)<<" (radix 2)\n";
source<<"if (i < "<<(L*m)<<") {\n"; source<<"for (int i = get_local_id(0); i < "<<(L*m)<<"; i += get_local_size(0)) {\n";
source<<"int j = i/"<<m<<";\n"; source<<"int j = i/"<<m<<";\n";
source<<"float2 c0 = data"<<input<<"[i];\n"; source<<"float2 c0 = data"<<input<<"[i];\n";
source<<"float2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n"; source<<"float2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
...@@ -175,7 +176,8 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult, ...@@ -175,7 +176,8 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult,
// Create the kernel. // Create the kernel.
source<<"matrix[element] = data"<<(stage%2)<<"[i];\n"; source<<"for (int i = get_local_id(0); i < XSIZE; i += get_local_size(0))\n";
source<<"matrix[i*XMULT+y*YMULT+z*ZMULT] = data"<<(stage%2)<<"[i];\n";
source<<"barrier(CLK_GLOBAL_MEM_FENCE);"; source<<"barrier(CLK_GLOBAL_MEM_FENCE);";
map<string, string> replacements; map<string, string> replacements;
replacements["XSIZE"] = OpenCLExpressionUtilities::intToString(xsize); replacements["XSIZE"] = OpenCLExpressionUtilities::intToString(xsize);
......
This diff is collapsed.
...@@ -7,13 +7,14 @@ float2 multiplyComplex(float2 c1, float2 c2) { ...@@ -7,13 +7,14 @@ float2 multiplyComplex(float2 c1, float2 c2) {
*/ */
__kernel void execFFT(__global float2* matrix, float sign, __local float2* w, __local float2* data0, __local float2* data1) { __kernel void execFFT(__global float2* matrix, float sign, __local float2* w, __local float2* data0, __local float2* data1) {
const int i = get_local_id(0); for (int i = get_local_id(0); i < XSIZE; i += get_local_size(0))
w[i] = (float2) (cos(-sign*i*2*M_PI/XSIZE), sin(-sign*i*2*M_PI/XSIZE)); w[i] = (float2) (cos(-sign*i*2*M_PI/XSIZE), sin(-sign*i*2*M_PI/XSIZE));
barrier(CLK_LOCAL_MEM_FENCE);
for (int index = get_group_id(0); index < YSIZE*ZSIZE; index += get_num_groups(0)) { for (int index = get_group_id(0); index < YSIZE*ZSIZE; index += get_num_groups(0)) {
int z = index/YSIZE; int z = index/YSIZE;
int y = index-z*YSIZE; int y = index-z*YSIZE;
int element = i*XMULT+y*YMULT+z*ZMULT; for (int i = get_local_id(0); i < XSIZE; i += get_local_size(0))
data0[i] = matrix[element]; data0[i] = matrix[i*XMULT+y*YMULT+z*ZMULT];
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
COMPUTE_FFT COMPUTE_FFT
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment