Commit 6a04965a authored by zhanghj2
Browse files

去掉__syncthreads和分支

parent ae382f02
......@@ -55,12 +55,14 @@ __device__ __forceinline__ void warp_allreduce_(Tensor<Engine0, Layout0> &dst, T
// smem_reduce(row, col) = dst(0);
}
__syncthreads();
if (tidx < 16)
{
smem_reduce(row + 64) = op(op(smem_reduce(row * 4), smem_reduce(row * 4 + 1)), op(smem_reduce(row * 4 + 2), smem_reduce(row * 4 + 3)));
}
__syncthreads();
dst(0) = smem_reduce(row + 64);
// if (tidx < 16)
// {
// smem_reduce(row + 64) = op(op(smem_reduce(row * 4), smem_reduce(row * 4 + 1)), op(smem_reduce(row * 4 + 2), smem_reduce(row * 4 + 3)));
// }
// __syncthreads();
// dst(0) = smem_reduce(row + 64);
dst(0) = op(op(smem_reduce(row * 4), smem_reduce(row * 4 + 1)), op(smem_reduce(row * 4 + 2), smem_reduce(row * 4 + 3)));
}
template<typename Engine0, typename Layout0, typename Engine1, typename Layout1, typename Operator>
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment