量化scales传输size优化

830124e1 · lishen · d0fcf024 · 830124e1 · 830124e1 · 830124e1
Commit 830124e1 authored Feb 04, 2026 by lishen
5 changed files
--- a/csrc/config.hpp
+++ b/csrc/config.hpp
@@ -135,8 +135,8 @@ struct LowLatencyLayout {
    }
    LowLatencyLayout(void *rdma_buffer, int num_max_dispatch_tokens_per_rank, int hidden,
-                     int num_ranks, int num_experts) {
+                     int num_ranks, int num_experts, int quant_group_size=0) {
-        const int num_scales = hidden / QUANTIZATION_GROUPSIZE;
+        const int num_scales = quant_group_size == 0 ? 4 : hidden / QUANTIZATION_GROUPSIZE;   // should be 1, but 4 is used here to satisfy int4 alignment
        // Dispatch and combine layout:
        //  - 2 symmetric odd/even send buffer
@@ -205,9 +205,9 @@ struct LowLatencyLayout {
 };
 inline size_t get_low_latency_rdma_size_hint(int num_max_dispatch_tokens_per_rank, int hidden,
-                                             int num_ranks, int num_experts) {
+                                             int num_ranks, int num_experts, int quant_group_size=0) {
    auto num_bytes =
-        LowLatencyLayout(nullptr, num_max_dispatch_tokens_per_rank, hidden, num_ranks, num_experts)
+        LowLatencyLayout(nullptr, num_max_dispatch_tokens_per_rank, hidden, num_ranks, num_experts, quant_group_size)
            .total_bytes;
    return ((num_bytes + NUM_BUFFER_ALIGNMENT_BYTES) / NUM_BUFFER_ALIGNMENT_BYTES) *
           NUM_BUFFER_ALIGNMENT_BYTES;

--- a/csrc/deep_ep.cu
+++ b/csrc/deep_ep.cu
@@ -1271,10 +1271,10 @@ Buffer::internode_combine(
 #endif
 }
-void Buffer::clean_low_latency_buffer(int num_max_dispatch_tokens_per_rank, int hidden, int num_experts) {
+void Buffer::clean_low_latency_buffer(int num_max_dispatch_tokens_per_rank, int hidden, int num_experts, int quant_group_size) {
    EP_HOST_ASSERT(low_latency_mode);
-    auto layout = LowLatencyLayout(rdma_buffer_ptr, num_max_dispatch_tokens_per_rank, hidden, num_ranks, num_experts);
+    auto layout = LowLatencyLayout(rdma_buffer_ptr, num_max_dispatch_tokens_per_rank, hidden, num_ranks, num_experts, quant_group_size);
    auto clean_meta_0 = layout.buffers[0].clean_meta();
    auto clean_meta_1 = layout.buffers[1].clean_meta();
@@ -1311,7 +1311,7 @@ Buffer::low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_i
    auto num_local_experts = num_experts / num_ranks;
    // Buffer control
-    LowLatencyLayout layout(rdma_buffer_ptr, num_max_dispatch_tokens_per_rank, hidden, num_ranks, num_experts);
+    LowLatencyLayout layout(rdma_buffer_ptr, num_max_dispatch_tokens_per_rank, hidden, num_ranks, num_experts, quant_group_size);
    EP_HOST_ASSERT(layout.total_bytes <= num_rdma_bytes);
    auto buffer = layout.buffers[low_latency_buffer_idx];
    auto next_buffer = layout.buffers[low_latency_buffer_idx ^= 1];

--- a/csrc/deep_ep.hpp
+++ b/csrc/deep_ep.hpp
@@ -172,7 +172,7 @@ public:
        std::optional<EventHandle> &previous_event, bool async, bool allocate_on_comm_stream);
    void clean_low_latency_buffer(int num_max_dispatch_tokens_per_rank, int hidden,
-                                  int num_experts);
+                                  int num_experts, int quant_group_size=0);
    std::tuple<torch::Tensor, std::optional<torch::Tensor>, torch::Tensor, torch::Tensor, torch::Tensor, std::optional<EventHandle>, std::optional<std::function<void()>>>
    low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_idx,

--- a/csrc/kernels/internode_ll.cu
+++ b/csrc/kernels/internode_ll.cu
@@ -210,13 +210,13 @@ __global__ __launch_bounds__(16 * kWarpSize, 1) void
    // Message package: hidden data, FP8 scales, index at source
    // NOTES: currently we have 3 reserved int fields for future use
    using vec_t = typename std::conditional<kUseQuant8Bit, int2, int4>::type;
-    constexpr size_t num_bytes_per_msg = sizeof(int4) + (kUseQuant8Bit ? (kHidden + kNumScales * sizeof(float)) : (kHidden * sizeof(hip_bfloat16)));
+    constexpr size_t num_bytes_per_msg = sizeof(int4) + 
+        (kUseQuant8Bit ? (kHidden + (kQuantGroupSize == 0 ? 4 : kNumScales) * sizeof(float)) : (kHidden * sizeof(hip_bfloat16)));
    EP_STATIC_ASSERT(num_bytes_per_msg % sizeof(int4) == 0, "Invalid message size");
    constexpr size_t num_int4_per_msg = num_bytes_per_msg / sizeof(int4);
    // Expert counts
-    constexpr int kNumMaxWarpGroups = 1024 / kWarpSize;
+    __shared__ int shared_num_tokens_sent_per_expert[kMaxNumWarps];
-    __shared__ int shared_num_tokens_sent_per_expert[kNumMaxWarpGroups];
    // Sending phase
    if ((phases & LOW_LATENCY_SEND_PHASE) == 0)
@@ -230,7 +230,7 @@ __global__ __launch_bounds__(16 * kWarpSize, 1) void
        constexpr int kNumThreadPerGroup = QUANTIZATION_GROUPSIZE / kNumElemsPerRead;
        // EP_DEVICE_ASSERT(kHidden % kNumElemsPerRead == 0);
        EP_STATIC_ASSERT(kNumElemsPerRead * kWarpSize % kNumPerChannels == 0, "Invalid vectorization");
-        const auto num_threads = (num_warps - 1) * kWarpSize;
+        const auto num_threads = num_warps * kWarpSize;
        constexpr int hidden_bf16_int4 = kHidden / kNumElemsPerRead;
        for (int token_idx = sm_id; token_idx < num_tokens; token_idx += num_sms) {
@@ -375,7 +375,7 @@ __global__ __launch_bounds__(16 * kWarpSize, 1) void
                atomic_add_release_global(atomic_finish_counter_per_expert + i, FINISHED_SUM_TAG);
        }
        // This SM should be responsible for some destination experts, read `topk_idx` for them
-        int expert_count[kNumMaxWarpGroups] = {0};
+        int expert_count[kMaxNumWarps] = {0};
        const auto expert_begin_idx = sm_id * num_warp_groups;
        const auto expert_end_idx = min(expert_begin_idx + num_warp_groups, num_experts);
@@ -465,7 +465,7 @@ LOW_LATENCY_DISPATCH_RECV:
                                       (kQuantGroupSize == 0 ? 1 : num_aligned_scales);
        // Shared between sub-warps in warp groups
-        __shared__ int shared_num_recv_tokens[kNumMaxWarpGroups], shared_recv_token_begin_idx[kNumMaxWarpGroups];
+        __shared__ int shared_num_recv_tokens[kMaxNumWarps], shared_recv_token_begin_idx[kMaxNumWarps];
        // Wait tokens to arrive
        // NOTES: using sub-warp 1 to overlap with sub-warp 0

--- a/deep_ep/buffer.py
+++ b/deep_ep/buffer.py
@@ -212,7 +212,7 @@ class Buffer:
    @staticmethod
    def get_low_latency_rdma_size_hint(
-        num_max_dispatch_tokens_per_rank: int, hidden: int, num_ranks: int, num_experts: int
+        num_max_dispatch_tokens_per_rank: int, hidden: int, num_ranks: int, num_experts: int, quant_group_size: int = 0
    ) -> int:
        """
        Get a minimum size requirement for the RDMA buffer. The size calculation will be done with BF16.
@@ -222,12 +222,13 @@ class Buffer:
            hidden: the hidden dimension of each token.
            num_ranks: the number of EP group ranks.
            num_experts: the number of all experts.
+            quant_group_size: the quantization group size (only relevant when quantization is used).
        Returns:
            size: the RDMA buffer size recommended.
        """
        return deep_ep_cpp.get_low_latency_rdma_size_hint(
-            num_max_dispatch_tokens_per_rank, hidden, num_ranks, num_experts
+            num_max_dispatch_tokens_per_rank, hidden, num_ranks, num_experts, quant_group_size
        )
    def get_comm_stream(self) -> torch.Stream:
@@ -823,7 +824,7 @@ class Buffer:
        return combined_x, combined_topk_weights, EventOverlap(event)
    def clean_low_latency_buffer(
-        self, num_max_dispatch_tokens_per_rank: int, hidden: int, num_experts: int
+        self, num_max_dispatch_tokens_per_rank: int, hidden: int, num_experts: int, quant_group_size: int = 0
    ) -> None:
        """
        As low-latency kernels require part of the buffer to be zero-initialized, so it is vital to clean the buffer
@@ -835,8 +836,9 @@ class Buffer:
            num_max_dispatch_tokens_per_rank: the maximum number of tokens to dispatch, all the ranks must hold the same value.
            hidden: the hidden dimension of each token.
            num_experts: the number of all experts.
+            quant_group_size: the quantization group size (only relevant when quantization is used).
        """
-        self.runtime.clean_low_latency_buffer(num_max_dispatch_tokens_per_rank, hidden, num_experts)
+        self.runtime.clean_low_latency_buffer(num_max_dispatch_tokens_per_rank, hidden, num_experts, quant_group_size)
    # noinspection PyTypeChecker
    def low_latency_dispatch(self, x: torch.Tensor, topk_idx: torch.Tensor,