fix combine timeout due to delayed forwarder min head update (#353)

* fix combine timeout due to forwarder min head update
* Update head before and after combine_token; add assertion for nvl_buffer_size_per_rdma_rank

---------

Co-authored-by: zhiyi Hu <zhiyihu@U-NYQQMGK0-2250.local>

fix combine timeout due to delayed forwarder min head update (#353)
* fix combine timeout due to forwarder min head update
* Update head before and after combine_token; add assertion for nvl_buffer_size_per_rdma_rank

---------

Co-authored-by: zhiyi Hu <zhiyihu@U-NYQQMGK0-2250.local>
1da73be0 · Zhiyi Hu · GitHub · ab484794 · 1da73be0
Unverified Commit 1da73be0 authored Aug 26, 2025 by Zhiyi Hu Committed by GitHub Aug 26, 2025
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 4 additions and 1 deletion

csrc/kernels/internode.cu csrc/kernels/internode.cu +4 -1

No files found.
--- a/csrc/kernels/internode.cu
+++ b/csrc/kernels/internode.cu
@@ -1647,8 +1647,10 @@ combine(int4* combined_x, float* combined_topk_weights,
                    // Read expected head
                    EP_STATIC_ASSERT(kNumRDMARanks <= 32, "Invalid number of RDMA peers");
                    int expected_head = -1;
-                    if (lane_id < NUM_MAX_NVL_PEERS)
+                    if (lane_id < NUM_MAX_NVL_PEERS) {
                        expected_head = ld_nc_global(combined_nvl_head + token_idx * NUM_MAX_NVL_PEERS + lane_id);
+                        expected_head < 0 ? (forwarder_nvl_head[warp_id][lane_id] = -expected_head - 1) : (forwarder_nvl_head[warp_id][lane_id] = expected_head);
+                    }
                    // Wait lanes to be ready
                    start_time = clock64();
@@ -1851,6 +1853,7 @@ void combine(cudaDataType_t type,
    EP_HOST_ASSERT(num_forwarder_warps > NUM_MAX_NVL_PEERS and num_forwarder_warps % num_rdma_ranks == 0);
    EP_HOST_ASSERT(num_max_nvl_chunked_recv_tokens % num_rdma_ranks == 0);
    EP_HOST_ASSERT(num_max_nvl_chunked_recv_tokens / num_rdma_ranks > std::max(num_max_rdma_chunked_send_tokens, num_max_nvl_chunked_send_tokens));
+    EP_HOST_ASSERT(num_max_nvl_chunked_recv_tokens / num_rdma_ranks - num_warps_per_forwarder >= num_max_nvl_chunked_send_tokens);
    EP_HOST_ASSERT(num_max_rdma_chunked_send_tokens >= num_warps_per_forwarder);
    EP_HOST_ASSERT(type == CUDA_R_16BF);