"tests/vscode:/vscode.git/clone" did not exist on "7015417fd4910a47263ea34c79c2cdb2ff314fdf"
Commit f587d8f7 authored by zhuwenwen's avatar zhuwenwen
Browse files

update barrier_at_start and barrier_at_end

parent b0eacb5b
......@@ -388,13 +388,13 @@ __global__ void __launch_bounds__(512, 1)
__atomic_store_n(curr_hdp_reg[i], 0x1, __ATOMIC_RELAXED);
}
}
start_sync<ngpus>(sg, self_sg, rank);
barrier_at_start<ngpus>(sg, self_sg, rank);
// do the actual reduction
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size;
idx += gridDim.x * blockDim.x) {
((P*)result)[idx] = packed_reduce<P, ngpus, A>((const P**)&dp.ptrs[0], idx);
}
end_sync<ngpus, true>(sg, self_sg, rank);
barrier_at_end<ngpus, true>(sg, self_sg, rank);
}
template <typename T, int ngpus>
......@@ -424,13 +424,13 @@ __global__ void __launch_bounds__(512, 1)
tmps[i] = get_tmp_buf<P>(sg.signals[target]);
}
auto tmp_out = tmps[0];
start_sync<ngpus>(sg, self_sg, rank);
barrier_at_start<ngpus>(sg, self_sg, rank);
// stage 1: reduce scatter
for (int idx = start + tid; idx < end; idx += stride) {
tmp_out[idx - start] = packed_reduce<P, ngpus, A>(ptrs, idx);
}
end_sync<ngpus>(sg, self_sg, rank);
barrier_at_end<ngpus>(sg, self_sg, rank);
// stage 2: allgather. Note: it's important to match the tid between
// the two stages, because visibility across devices is only guaranteed
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment