"vscode:/vscode.git/clone" did not exist on "66064facae82bbf98b55f43c80bd5db9652d069b"
Commit e7e6eebc authored by Yutong Zhao's avatar Yutong Zhao
Browse files

fixes fft thread divergence, makes hd 6xxx 5xxx cards work

parent d96606e2
......@@ -227,8 +227,10 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source<<"for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))\n";
source<<"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"<<(stage%2)<<"[z];\n";
}
else
else {
source<<"if (index < XSIZE*ZSIZE)\n";
source<<"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)];\n";
}
source<<"barrier(CLK_GLOBAL_MEM_FENCE);";
map<string, string> replacements;
replacements["XSIZE"] = context.intToString(xsize);
......
......@@ -11,16 +11,20 @@ __kernel void execFFT(__global const real2* restrict in, __global real2* restric
for (int i = get_local_id(0); i < ZSIZE; i += get_local_size(0))
w[i] = (real2) (cos(-sign*i*2*M_PI/ZSIZE), sin(-sign*i*2*M_PI/ZSIZE));
barrier(CLK_LOCAL_MEM_FENCE);
for (int index = get_group_id(0)*BLOCKS_PER_GROUP+get_local_id(0)/ZSIZE; index < XSIZE*YSIZE; index += get_num_groups(0)*BLOCKS_PER_GROUP) {
for (int baseIndex = get_group_id(0)*BLOCKS_PER_GROUP; baseIndex < XSIZE*YSIZE; baseIndex += get_num_groups(0)*BLOCKS_PER_GROUP) {
int index = baseIndex+get_local_id(0)/ZSIZE;
int x = index/YSIZE;
int y = index-x*YSIZE;
#if LOOP_REQUIRED
for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))
data0[z] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+z];
#else
data0[get_local_id(0)] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+get_local_id(0)%ZSIZE];
if (index < XSIZE*ZSIZE)
data0[get_local_id(0)] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+get_local_id(0)%ZSIZE];
#endif
barrier(CLK_LOCAL_MEM_FENCE);
COMPUTE_FFT
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment