Unverified Commit d89f825d authored by Zihao Ye's avatar Zihao Ye Committed by GitHub
Browse files

upd (#2336)

parent 92a3d07d
......@@ -106,8 +106,11 @@ __global__ void SDDMMCooTreeReduceKernel(
const Idx lhs_add = UseBcast ? __ldg(lhs_off + i) : i;
const Idx rhs_add = UseBcast ? __ldg(rhs_off + i) : i;
DType val = 0.;
for (int j = tx; j < reduce_size; j += 32)
for (int j = tx; j < reduce_size; j += 64) {
val += lhsoff[lhs_add * reduce_size + j] * rhsoff[rhs_add * reduce_size + j];
if (j + 32 < reduce_size)
val += lhsoff[lhs_add * reduce_size + j + 32] * rhsoff[rhs_add * reduce_size + j + 32];
}
#pragma unroll
for (int offset = 16; offset > 0; offset /= 2)
val += __shfl_down_sync(full_mask, val, offset);
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment