Commit ab1acdce authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge branch 'v0.9.2-dev-ds-wm-1215' into 'v0.9.2-dev-ds'

优化高吞吐模式num_sms

See merge request dcutoolkit/deeplearing/vllm!297
parents b2db7ca2 10400c58
...@@ -173,6 +173,7 @@ class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase): ...@@ -173,6 +173,7 @@ class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase):
if self.internode: if self.internode:
num_rdma_bytes = int(1e9/2) #1024 * 1024 * 1024 num_rdma_bytes = int(1e9/2) #1024 * 1024 * 1024
num_qps_per_rank = 30 #self.num_sms // 2 num_qps_per_rank = 30 #self.num_sms // 2
self.num_sms = 30
# import deep_ep # import deep_ep
# num_nvl_bytes, num_rdma_bytes = 0, 0 # num_nvl_bytes, num_rdma_bytes = 0, 0
...@@ -184,6 +185,7 @@ class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase): ...@@ -184,6 +185,7 @@ class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase):
else: else:
num_rdma_bytes = 0 num_rdma_bytes = 0
num_qps_per_rank = 1 num_qps_per_rank = 1
self.num_sms = 60
assert num_rdma_bytes is not None assert num_rdma_bytes is not None
assert num_qps_per_rank is not None assert num_qps_per_rank is not None
......
...@@ -12,7 +12,7 @@ import torch ...@@ -12,7 +12,7 @@ import torch
import vllm.envs as envs import vllm.envs as envs
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
from vllm.model_executor.layers.fused_moe.utils import _resize_cache from vllm.model_executor.layers.fused_moe.utils import _resize_cache
from vllm.utils import cdiv from vllm.utils import cdiv, async_tensor_h2d
# #
# This file defines a set of base classes used to make MoE kernels more modular. # This file defines a set of base classes used to make MoE kernels more modular.
...@@ -97,6 +97,8 @@ class FusedMoEActivationFormat(Enum): ...@@ -97,6 +97,8 @@ class FusedMoEActivationFormat(Enum):
BatchedExperts = "batched_experts", BatchedExperts = "batched_experts",
@dataclass @dataclass
class ExpertTokensMetadata: class ExpertTokensMetadata:
""" """
...@@ -110,11 +112,16 @@ class ExpertTokensMetadata: ...@@ -110,11 +112,16 @@ class ExpertTokensMetadata:
def make_from_list( def make_from_list(
expert_num_tokens_list: list[int], device: str expert_num_tokens_list: list[int], device: str
) -> "ExpertTokensMetadata": ) -> "ExpertTokensMetadata":
# expert_num_tokens_cpu = torch.tensor(
# expert_num_tokens_list, device="cpu", dtype=torch.int32
# )
expert_num_tokens_cpu = torch.tensor( expert_num_tokens_cpu = torch.tensor(
expert_num_tokens_list, device="cpu", dtype=torch.int32 expert_num_tokens_list, device="cpu", dtype=torch.int32, pin_memory=True
) )
expert_num_tokens = expert_num_tokens_cpu.to(device=device, non_blocking=True)
return ExpertTokensMetadata( return ExpertTokensMetadata(
expert_num_tokens=expert_num_tokens_cpu.to(device, non_blocking=True), expert_num_tokens=expert_num_tokens,
expert_num_tokens_cpu=expert_num_tokens_cpu, expert_num_tokens_cpu=expert_num_tokens_cpu,
) )
......
...@@ -547,41 +547,6 @@ def _fwd_kernel_ep_scatter_2( ...@@ -547,41 +547,6 @@ def _fwd_kernel_ep_scatter_2(
SCALE_HIDDEN_SIZE: tl.constexpr, SCALE_HIDDEN_SIZE: tl.constexpr,
SCALE_HIDDEN_SIZE_PAD: tl.constexpr, SCALE_HIDDEN_SIZE_PAD: tl.constexpr,
): ):
# start_token_id = tl.program_id(0)
# grid_num = tl.num_programs(0)
# offset_in = tl.arange(0, HIDDEN_SIZE_PAD)
# mask = offset_in < HIDDEN_SIZE
# offset_in_s = tl.arange(0, SCALE_HIDDEN_SIZE_PAD)
# mask_s = offset_in_s < SCALE_HIDDEN_SIZE
# for token_id in range(start_token_id, total_token_num, grid_num):
# to_copy = tl.load(recv_x + token_id * recv_x_stride0 + offset_in, mask=mask)
# to_copy_s = tl.load(
# recv_x_scale + token_id * recv_x_scale_stride0 + offset_in_s, mask=mask_s
# )
# for topk_index in tl.range(0, topk_num, 1, num_stages=4):
# expert_id = tl.load(recv_topk + token_id * recv_topk_stride0 + topk_index)
# if HAS_EXPERT_MAP:
# expert_id = apply_expert_map(expert_id, expert_map)
# if expert_id >= 0:
# dest_token_index = tl.atomic_add(expert_start_loc + expert_id, 1)
# tl.store(
# output_index + token_id * output_index_stride0 + topk_index,
# dest_token_index,
# )
# output_tensor_ptr = (
# output_tensor + dest_token_index * output_tensor_stride0
# )
# output_tensor_scale_ptr = (
# output_tensor_scale + dest_token_index * output_tensor_scale_stride0
# )
# tl.store(output_tensor_ptr + offset_in, to_copy, mask=mask)
# tl.store(output_tensor_scale_ptr + offset_in_s, to_copy_s, mask=mask_s)
start_token_id = tl.program_id(0) start_token_id = tl.program_id(0)
grid_num = tl.num_programs(0) grid_num = tl.num_programs(0)
...@@ -720,43 +685,6 @@ def _fwd_kernel_ep_gather( ...@@ -720,43 +685,6 @@ def _fwd_kernel_ep_gather(
HAS_EXPERT_MAP: tl.constexpr, HAS_EXPERT_MAP: tl.constexpr,
BLOCK_D: tl.constexpr, BLOCK_D: tl.constexpr,
): ):
# cur_block = tl.program_id(0)
# start_cur_token = tl.program_id(1)
# grid_num = tl.num_programs(1)
# for cur_token in range(start_cur_token, total_token_num, grid_num):
# off_d = tl.arange(0, BLOCK_D)
# accumulator = tl.zeros([BLOCK_D], dtype=tl.float32)
# for topk_index in range(0, topk_num):
# expert_id = tl.load(
# recv_topk_ids + cur_token * recv_topk_ids_stride0 + topk_index
# )
# if HAS_EXPERT_MAP:
# expert_id = apply_expert_map(expert_id, expert_map)
# if expert_id >= 0:
# source_token_index = tl.load(
# input_index + cur_token * input_index_stride0 + topk_index
# )
# acc_weight = tl.load(
# recv_topk_weight + cur_token * recv_topk_weight_stride0 + topk_index
# )
# tmp = tl.load(
# input_tensor
# + source_token_index * input_tensor_stride0
# + cur_block * BLOCK_D
# + off_d
# )
# accumulator += tmp.to(tl.float32) * acc_weight
# tl.store(
# output_tensor
# + cur_token * output_tensor_stride0
# + cur_block * BLOCK_D
# + off_d,
# accumulator.to(output_tensor.dtype.element_ty),
# )
cur_block_int32 = tl.program_id(0) cur_block_int32 = tl.program_id(0)
cur_block = cur_block_int32.to(tl.int64) cur_block = cur_block_int32.to(tl.int64)
...@@ -856,6 +784,7 @@ def deepgemm_moe_permute( ...@@ -856,6 +784,7 @@ def deepgemm_moe_permute(
expert_num_tokens: Optional[torch.Tensor] = None, expert_num_tokens: Optional[torch.Tensor] = None,
expert_num_tokens_cpu: Optional[torch.Tensor] = None, expert_num_tokens_cpu: Optional[torch.Tensor] = None,
aq_out: torch.Tensor | None = None, aq_out: torch.Tensor | None = None,
M_sum: int | None = None,
): ):
assert aq.ndim == 2 assert aq.ndim == 2
assert topk_ids.dtype.is_signed, "The kernel uses -1 to represent invalid topk_ids" assert topk_ids.dtype.is_signed, "The kernel uses -1 to represent invalid topk_ids"
...@@ -864,13 +793,14 @@ def deepgemm_moe_permute( ...@@ -864,13 +793,14 @@ def deepgemm_moe_permute(
block_m = block_shape[0] block_m = block_shape[0]
M_sum = compute_aligned_M( if M_sum is None:
M=topk_ids.size(0), M_sum = compute_aligned_M(
num_topk=topk_ids.size(1), M=topk_ids.size(0),
local_num_experts=local_num_experts, num_topk=topk_ids.size(1),
alignment=block_m, local_num_experts=local_num_experts,
expert_num_tokens_cpu=expert_num_tokens_cpu, alignment=block_m,
) expert_num_tokens_cpu=expert_num_tokens_cpu,
)
expert_start_loc = torch.empty( expert_start_loc = torch.empty(
(local_num_experts), device=device, dtype=torch.int32 (local_num_experts), device=device, dtype=torch.int32
......
...@@ -346,6 +346,7 @@ class CompressedTensorsW8A8Int8MarlinMoEMethod(CompressedTensorsMarlinMoEMethod) ...@@ -346,6 +346,7 @@ class CompressedTensorsW8A8Int8MarlinMoEMethod(CompressedTensorsMarlinMoEMethod)
expert_num_tokens=expert_num_tokens, expert_num_tokens=expert_num_tokens,
expert_num_tokens_cpu=expert_num_tokens_cpu, expert_num_tokens_cpu=expert_num_tokens_cpu,
aq_out=a1q_perm, aq_out=a1q_perm,
M_sum=M_sum
) )
# if expert_map is not None: # if expert_map is not None:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment