Replace start_sync/end_sync calls with barrier_at_start/barrier_at_end

f587d8f7 · zhuwenwen · b0eacb5b · f587d8f7
Commit f587d8f7 authored Apr 18, 2025 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing with 4 additions and 4 deletions

csrc/custom_all_reduce.cuh (+4 -4)

No files found.
--- a/csrc/custom_all_reduce.cuh
+++ b/csrc/custom_all_reduce.cuh
@@ -388,13 +388,13 @@ __global__ void __launch_bounds__(512, 1)
      __atomic_store_n(curr_hdp_reg[i], 0x1, __ATOMIC_RELAXED);
    }
  }
-  start_sync<ngpus>(sg, self_sg, rank);
+  barrier_at_start<ngpus>(sg, self_sg, rank);
  // do the actual reduction
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size;
       idx += gridDim.x * blockDim.x) {
    ((P*)result)[idx] = packed_reduce<P, ngpus, A>((const P**)&dp.ptrs[0], idx);
  }
-  end_sync<ngpus, true>(sg, self_sg, rank);
+  barrier_at_end<ngpus, true>(sg, self_sg, rank);
 }
 template <typename T, int ngpus>
@@ -424,13 +424,13 @@ __global__ void __launch_bounds__(512, 1)
    tmps[i] = get_tmp_buf<P>(sg.signals[target]);
  }
  auto tmp_out = tmps[0];
-  start_sync<ngpus>(sg, self_sg, rank);
+  barrier_at_start<ngpus>(sg, self_sg, rank);
  // stage 1: reduce scatter
  for (int idx = start + tid; idx < end; idx += stride) {
    tmp_out[idx - start] = packed_reduce<P, ngpus, A>(ptrs, idx);
  }
-  end_sync<ngpus>(sg, self_sg, rank);
+  barrier_at_end<ngpus>(sg, self_sg, rank);
  // stage 2: allgather. Note: it's important to match the tid between
  // the two stages, because visibility across devices is only guaranteed