Commit f587d8f7 authored by zhuwenwen
Browse files

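update barrier_at_start and barrier_at_end (replace the start_sync/end_sync calls in the all-reduce kernels with barrier_at_start/barrier_at_end)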

parent b0eacb5b
...@@ -388,13 +388,13 @@ __global__ void __launch_bounds__(512, 1) ...@@ -388,13 +388,13 @@ __global__ void __launch_bounds__(512, 1)
__atomic_store_n(curr_hdp_reg[i], 0x1, __ATOMIC_RELAXED); __atomic_store_n(curr_hdp_reg[i], 0x1, __ATOMIC_RELAXED);
} }
} }
start_sync<ngpus>(sg, self_sg, rank); barrier_at_start<ngpus>(sg, self_sg, rank);
// do the actual reduction // do the actual reduction
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size;
idx += gridDim.x * blockDim.x) { idx += gridDim.x * blockDim.x) {
((P*)result)[idx] = packed_reduce<P, ngpus, A>((const P**)&dp.ptrs[0], idx); ((P*)result)[idx] = packed_reduce<P, ngpus, A>((const P**)&dp.ptrs[0], idx);
} }
end_sync<ngpus, true>(sg, self_sg, rank); barrier_at_end<ngpus, true>(sg, self_sg, rank);
} }
template <typename T, int ngpus> template <typename T, int ngpus>
...@@ -424,13 +424,13 @@ __global__ void __launch_bounds__(512, 1) ...@@ -424,13 +424,13 @@ __global__ void __launch_bounds__(512, 1)
tmps[i] = get_tmp_buf<P>(sg.signals[target]); tmps[i] = get_tmp_buf<P>(sg.signals[target]);
} }
auto tmp_out = tmps[0]; auto tmp_out = tmps[0];
start_sync<ngpus>(sg, self_sg, rank); barrier_at_start<ngpus>(sg, self_sg, rank);
// stage 1: reduce scatter // stage 1: reduce scatter
for (int idx = start + tid; idx < end; idx += stride) { for (int idx = start + tid; idx < end; idx += stride) {
tmp_out[idx - start] = packed_reduce<P, ngpus, A>(ptrs, idx); tmp_out[idx - start] = packed_reduce<P, ngpus, A>(ptrs, idx);
} }
end_sync<ngpus>(sg, self_sg, rank); barrier_at_end<ngpus>(sg, self_sg, rank);
// stage 2: allgather. Note: it's important to match the tid between // stage 2: allgather. Note: it's important to match the tid between
// the two stages, because visibility across devices is only guaranteed // the two stages, because visibility across devices is only guaranteed
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment