printf("stuck waiting for my buddy, thread %d myvalue %d data offset %d flag offset %d read value %d\n",threadIdx.x,magic,4*THREADS_PER_PIXEL+off*ELEMENTS_PER_LDG*THREADS_PER_PIXEL+ELEMENTS_PER_LDG*threadIdx.x,off*THREADS_PER_PIXEL,read_flag[threadIdx.x]);
safety=0;
}
}
#else
// #else branch of the surrounding conditional: busy-wait until this thread's
// flag slot (read_flag[threadIdx.x]) equals `magic`. The loop body is empty and
// there is no iteration limit on this path, so it exits only once the values match.
// NOTE(review): this relies on read_flag being re-read from memory on every
// iteration (e.g. declared volatile); its declaration is outside this view — confirm.
while((read_flag[threadIdx.x])!=(magic));
#endif
// now each CTA (on each GPU) reads the data written by CTA 0 of the other GPU