Commit 374676c0 authored by Peter Eastman's avatar Peter Eastman
Browse files

Workaround for AMD OpenCL bug: FFT can work with a smaller number of threads per work group

parent ae36b287
......@@ -42,15 +42,16 @@ OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize
}
void OpenCLFFT3D::execFFT(OpenCLArray<mm_float2>& data, bool forward) {
int maxSize = xkernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice());
xkernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
xkernel.setArg<cl_float>(1, forward ? 1.0f : -1.0f);
context.executeKernel(xkernel, xsize*ysize*zsize, xsize);
context.executeKernel(xkernel, xsize*ysize*zsize, min(xsize, (int) maxSize));
ykernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
ykernel.setArg<cl_float>(1, forward ? 1.0f : -1.0f);
context.executeKernel(ykernel, xsize*ysize*zsize, ysize);
context.executeKernel(ykernel, xsize*ysize*zsize, min(ysize, (int) maxSize));
zkernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
zkernel.setArg<cl_float>(1, forward ? 1.0f : -1.0f);
context.executeKernel(zkernel, xsize*ysize*zsize, zsize);
context.executeKernel(zkernel, xsize*ysize*zsize, min(zsize, (int) maxSize));
}
int OpenCLFFT3D::findLegalDimension(int minimum) {
......@@ -86,7 +87,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult,
if (unfactored%5 == 0) {
L = L/5;
source<<"// Pass "<<(stage+1)<<" (radix 5)\n";
source<<"if (i < "<<(L*m)<<") {\n";
source<<"for (int i = get_local_id(0); i < "<<(L*m)<<"; i += get_local_size(0)) {\n";
source<<"int j = i/"<<m<<";\n";
source<<"float2 c0 = data"<<input<<"[i];\n";
source<<"float2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
......@@ -117,7 +118,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult,
else if (unfactored%4 == 0) {
L = L/4;
source<<"// Pass "<<(stage+1)<<" (radix 4)\n";
source<<"if (i < "<<(L*m)<<") {\n";
source<<"for (int i = get_local_id(0); i < "<<(L*m)<<"; i += get_local_size(0)) {\n";
source<<"int j = i/"<<m<<";\n";
source<<"float2 c0 = data"<<input<<"[i];\n";
source<<"float2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
......@@ -138,7 +139,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult,
else if (unfactored%3 == 0) {
L = L/3;
source<<"// Pass "<<(stage+1)<<" (radix 3)\n";
source<<"if (i < "<<(L*m)<<") {\n";
source<<"for (int i = get_local_id(0); i < "<<(L*m)<<"; i += get_local_size(0)) {\n";
source<<"int j = i/"<<m<<";\n";
source<<"float2 c0 = data"<<input<<"[i];\n";
source<<"float2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
......@@ -156,7 +157,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult,
else if (unfactored%2 == 0) {
L = L/2;
source<<"// Pass "<<(stage+1)<<" (radix 2)\n";
source<<"if (i < "<<(L*m)<<") {\n";
source<<"for (int i = get_local_id(0); i < "<<(L*m)<<"; i += get_local_size(0)) {\n";
source<<"int j = i/"<<m<<";\n";
source<<"float2 c0 = data"<<input<<"[i];\n";
source<<"float2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
......@@ -175,7 +176,8 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int xmult,
// Create the kernel.
source<<"matrix[element] = data"<<(stage%2)<<"[i];\n";
source<<"for (int i = get_local_id(0); i < XSIZE; i += get_local_size(0))\n";
source<<"matrix[i*XMULT+y*YMULT+z*ZMULT] = data"<<(stage%2)<<"[i];\n";
source<<"barrier(CLK_GLOBAL_MEM_FENCE);";
map<string, string> replacements;
replacements["XSIZE"] = OpenCLExpressionUtilities::intToString(xsize);
......
This diff is collapsed.
......@@ -7,13 +7,14 @@ float2 multiplyComplex(float2 c1, float2 c2) {
*/
__kernel void execFFT(__global float2* matrix, float sign, __local float2* w, __local float2* data0, __local float2* data1) {
const int i = get_local_id(0);
w[i] = (float2) (cos(-sign*i*2*M_PI/XSIZE), sin(-sign*i*2*M_PI/XSIZE));
for (int i = get_local_id(0); i < XSIZE; i += get_local_size(0))
w[i] = (float2) (cos(-sign*i*2*M_PI/XSIZE), sin(-sign*i*2*M_PI/XSIZE));
barrier(CLK_LOCAL_MEM_FENCE);
for (int index = get_group_id(0); index < YSIZE*ZSIZE; index += get_num_groups(0)) {
int z = index/YSIZE;
int y = index-z*YSIZE;
int element = i*XMULT+y*YMULT+z*ZMULT;
data0[i] = matrix[element];
for (int i = get_local_id(0); i < XSIZE; i += get_local_size(0))
data0[i] = matrix[i*XMULT+y*YMULT+z*ZMULT];
barrier(CLK_LOCAL_MEM_FENCE);
COMPUTE_FFT
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment