[BUG FIX] sync torch stream before nccl send/recv

2bd187cb · zms1999 · ff28081c · 2bd187cb
Commit 2bd187cb authored Sep 02, 2023 by zms1999
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 2 additions and 0 deletions

cuda/fastermoe/smart_schedule.h +2 -0

No files found.
--- a/cuda/fastermoe/smart_schedule.h
+++ b/cuda/fastermoe/smart_schedule.h
@@ -123,6 +123,7 @@ void fmoe_cuda_fused_forward_impl(
        long num_expert, long rank, long world_size, long expert_size,
        long pipeline_gran, CudaStreamManager* smgr) {
    auto torch_stream = c10::cuda::getCurrentCUDAStream().stream();
+    cudaStreamSynchronize(torch_stream);
    int *local_ptr = new int[num_expert * world_size + 1];
    int *global_ptr = new int[num_expert * world_size + 1];
@@ -282,6 +283,7 @@ void fmoe_cuda_fused_backward_impl(
        long num_expert, long rank, long world_size,
        long pipeline_gran, CudaStreamManager* smgr) {
    auto torch_stream = c10::cuda::getCurrentCUDAStream().stream();
+    cudaStreamSynchronize(torch_stream);
    int *local_ptr = new int[num_expert * world_size + 1];
    int *global_ptr = new int[num_expert * world_size + 1];