Commit e7e6eebc authored by Yutong Zhao's avatar Yutong Zhao
Browse files

fixes fft thread divergence, makes hd 6xxx 5xxx cards work

parent d96606e2
...@@ -227,8 +227,10 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa ...@@ -227,8 +227,10 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source<<"for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))\n"; source<<"for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))\n";
source<<"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"<<(stage%2)<<"[z];\n"; source<<"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"<<(stage%2)<<"[z];\n";
} }
else else {
source<<"if (index < XSIZE*ZSIZE)\n";
source<<"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)];\n"; source<<"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)];\n";
}
source<<"barrier(CLK_GLOBAL_MEM_FENCE);"; source<<"barrier(CLK_GLOBAL_MEM_FENCE);";
map<string, string> replacements; map<string, string> replacements;
replacements["XSIZE"] = context.intToString(xsize); replacements["XSIZE"] = context.intToString(xsize);
......
...@@ -11,16 +11,20 @@ __kernel void execFFT(__global const real2* restrict in, __global real2* restric ...@@ -11,16 +11,20 @@ __kernel void execFFT(__global const real2* restrict in, __global real2* restric
for (int i = get_local_id(0); i < ZSIZE; i += get_local_size(0)) for (int i = get_local_id(0); i < ZSIZE; i += get_local_size(0))
w[i] = (real2) (cos(-sign*i*2*M_PI/ZSIZE), sin(-sign*i*2*M_PI/ZSIZE)); w[i] = (real2) (cos(-sign*i*2*M_PI/ZSIZE), sin(-sign*i*2*M_PI/ZSIZE));
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for (int index = get_group_id(0)*BLOCKS_PER_GROUP+get_local_id(0)/ZSIZE; index < XSIZE*YSIZE; index += get_num_groups(0)*BLOCKS_PER_GROUP) {
for (int baseIndex = get_group_id(0)*BLOCKS_PER_GROUP; baseIndex < XSIZE*YSIZE; baseIndex += get_num_groups(0)*BLOCKS_PER_GROUP) {
int index = baseIndex+get_local_id(0)/ZSIZE;
int x = index/YSIZE; int x = index/YSIZE;
int y = index-x*YSIZE; int y = index-x*YSIZE;
#if LOOP_REQUIRED #if LOOP_REQUIRED
for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0)) for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))
data0[z] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+z]; data0[z] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+z];
#else #else
data0[get_local_id(0)] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+get_local_id(0)%ZSIZE]; if (index < XSIZE*ZSIZE)
data0[get_local_id(0)] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+get_local_id(0)%ZSIZE];
#endif #endif
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
COMPUTE_FFT COMPUTE_FFT
} }
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment