Unverified commit 98522149, authored by yizhang2077, committed by GitHub
Browse files

mirror fix for custom allreduce (#3124)

parent 5d9d15e7
......@@ -160,7 +160,7 @@ __inline__ __device__ void block_barrier(uint32_t** signals, uint32_t const flag
}
template <typename T, int RANKS_PER_NODE, bool COPY_INPUT = true>
-static __global__ void oneShotAllReduceKernel(AllReduceParams params) {
+static __global__ void __launch_bounds__(512, 1) oneShotAllReduceKernel(AllReduceParams params) {
// Suppose that two GPUs participate in the AR exchange, and we start four blocks.
// The message is partitioned into chunks as detailed below:
// message
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment