Merge branch 'fix_cached_notify' into 'ci-release'

Fix cached_notify error when the SM count is greater than 32. See merge request dcutoolkit/deeplearing/DeepEP!38

Merge branch 'fix_cached_notify' into 'ci-release'
Fix cached_notify error when the SM count is greater than 32. See merge request dcutoolkit/deeplearing/DeepEP!38
4276b979 · lijian6 · e45581db · 4b8d4b15 · 4276b979
Commit 4276b979 authored Jun 05, 2026 by lijian6
Hide whitespace changes
Inline Side-by-side

Showing with 48 additions and 51 deletions

csrc/kernels/internode.cu csrc/kernels/internode.cu +48 -51

No files found.
--- a/csrc/kernels/internode.cu
+++ b/csrc/kernels/internode.cu
@@ -1213,63 +1213,60 @@ cached_notify(const int rdma_clean_offset, const int rdma_num_int_clean, const i
        // Barrier again
        barrier_block<NUM_MAX_NVL_PEERS>(barrier_signal_ptrs, nvl_rank);
-    } else if (sm_id == 1) {
+    } else { 
        if (is_cached_dispatch)
            return;
-        EP_DEVICE_ASSERT(num_warps >= num_channels);
+        constexpr int num_clean_sms = 1;
-        EP_DEVICE_ASSERT(num_rdma_ranks <= kWarpSize);
+        const int logical_block = sm_id - num_clean_sms;
+        const int total_blocks  = gridDim.x - num_clean_sms;
-        // Iterate in reverse order
-        if (lane_id < num_rdma_ranks and warp_id < num_channels) {
+        if (logical_block < 0) return;
-            int token_start_idx, token_end_idx;
+        if (combined_rdma_head != nullptr) {
-            get_channel_task_range(num_combined_tokens, num_channels, warp_id, token_start_idx,
+            EP_DEVICE_ASSERT(num_rdma_ranks <= kWarpSize);
-                                   token_end_idx);
+            for (int chan = logical_block; chan < num_channels; chan += total_blocks) {
+                int token_start_idx, token_end_idx;
-            // NOTES: `1 << 25` is a heuristic large number
+                get_channel_task_range(num_combined_tokens, num_channels, chan, token_start_idx, token_end_idx);
-            int last_head = 1 << 25;
-            for (int token_idx = token_end_idx - 1; token_idx >= token_start_idx; --token_idx) {
+                for (int token_idx = token_end_idx - 1 - warp_id; token_idx >= token_start_idx; token_idx -= num_warps) {
-                auto current_head =
+                    int last_head = 1 << 25;
-                    __ldg(combined_rdma_head + token_idx * num_rdma_ranks + lane_id);
+                    if (lane_id < num_rdma_ranks) {
-                if (current_head < 0) {
+                        auto ptr = combined_rdma_head + token_idx * num_rdma_ranks + lane_id;
-                    combined_rdma_head[token_idx * num_rdma_ranks + lane_id] = -last_head - 1;
+                        int current_head = __ldg(ptr);
-                } else {
+                        if (current_head < 0) {
-                    last_head = current_head;
+                            *ptr = -last_head - 1;
+                        } else {
+                            last_head = current_head;
+                        }
+                    }
                }
            }
        }
-    } else {
+        if (combined_nvl_head != nullptr) {
-        if (is_cached_dispatch)
+            EP_DEVICE_ASSERT(rdma_channel_prefix_matrix != nullptr);
-            return;
+            EP_DEVICE_ASSERT(rdma_rank_prefix_sum != nullptr);
+            EP_STATIC_ASSERT(NUM_MAX_NVL_PEERS <= kWarpSize, "Too many NVL peers");
-        EP_DEVICE_ASSERT(num_warps >= num_channels);
-        EP_DEVICE_ASSERT(rdma_channel_prefix_matrix != nullptr and
+            for (int chan = logical_block; chan < num_channels; chan += total_blocks) {
-                                  rdma_rank_prefix_sum != nullptr);
+                for (int dst_rdma_rank = 0; dst_rdma_rank < num_rdma_ranks; ++dst_rdma_rank) {
-        EP_STATIC_ASSERT(NUM_MAX_NVL_PEERS <= kWarpSize, "Too many NVL peers");
+                    int token_start_idx = (chan == 0) ? 0 : rdma_channel_prefix_matrix[dst_rdma_rank * num_channels + chan - 1];
-        constexpr int num_clean_sms = 2;
+                    int token_end_idx = rdma_channel_prefix_matrix[dst_rdma_rank * num_channels + chan];
+                    int shift = (dst_rdma_rank == 0) ? 0 : rdma_rank_prefix_sum[dst_rdma_rank - 1];
-        if (lane_id < NUM_MAX_NVL_PEERS and warp_id < num_channels) {
+                    token_start_idx += shift;
-            for (int dst_rdma_rank = sm_id - num_clean_sms; dst_rdma_rank < num_rdma_ranks;
+                    token_end_idx   += shift;
-                 dst_rdma_rank += num_channels * 2 - num_clean_sms) {
-                // Iterate in reverse order
+                    for (int token_idx = token_end_idx - 1 - warp_id; token_idx >= token_start_idx; token_idx -= num_warps) {
-                int token_start_idx =
+                        int last_head = 1 << 25;
-                    warp_id == 0
+                        if (lane_id < NUM_MAX_NVL_PEERS) {
-                        ? 0
+                            auto ptr = combined_nvl_head +
-                        : rdma_channel_prefix_matrix[dst_rdma_rank * num_channels + warp_id - 1];
+                                       token_idx * NUM_MAX_NVL_PEERS + lane_id;
-                int token_end_idx =
+                            int current_head = __ldg(ptr);
-                    rdma_channel_prefix_matrix[dst_rdma_rank * num_channels + warp_id];
+                            if (current_head < 0) {
-                int shift = dst_rdma_rank == 0 ? 0 : rdma_rank_prefix_sum[dst_rdma_rank - 1];
+                                *ptr = -last_head - 1;
-                token_start_idx += shift, token_end_idx += shift;
+                            } else {
+                                last_head = current_head;
-                // NOTES: `1 << 25` is a heuristic large number
+                            }
-                int last_head = 1 << 25;
+                        }
-                for (int token_idx = token_end_idx - 1; token_idx >= token_start_idx; --token_idx) {
-                    auto current_head =
-                        __ldg(combined_nvl_head + token_idx * NUM_MAX_NVL_PEERS + lane_id);
-                    if (current_head < 0) {
-                        combined_nvl_head[token_idx * NUM_MAX_NVL_PEERS + lane_id] = -last_head - 1;
-                    } else {
-                        last_head = current_head;
                    }
                }
            }