Commit 76695c0a authored by 王敏
Browse files

优化deepep相关代码

parent cda54326
...@@ -4822,13 +4822,9 @@ class VllmConfig: ...@@ -4822,13 +4822,9 @@ class VllmConfig:
if ep_sp or enable_dp_attention: if ep_sp or enable_dp_attention:
batch_size_capture_list = sorted(set([round_up(i, tp_size) for i in batch_size_capture_list])) batch_size_capture_list = sorted(set([round_up(i, tp_size) for i in batch_size_capture_list]))
if 1 not in batch_size_capture_list:
batch_size_capture_list.insert(0, 1)
else: else:
if ep_sp or enable_dp_attention: if ep_sp or enable_dp_attention:
batch_size_capture_list = sorted(set([round_up(i, tp_size) for i in batch_size_capture_list])) batch_size_capture_list = sorted(set([round_up(i, tp_size) for i in batch_size_capture_list]))
if 1 not in batch_size_capture_list:
batch_size_capture_list.insert(0, 1)
self.compilation_config.init_with_cudagraph_sizes( self.compilation_config.init_with_cudagraph_sizes(
batch_size_capture_list) batch_size_capture_list)
......
...@@ -103,7 +103,7 @@ class DeviceCommunicatorBase: ...@@ -103,7 +103,7 @@ class DeviceCommunicatorBase:
# as long as we use data parallel (coupled data parallel # as long as we use data parallel (coupled data parallel
# where all data parallel ranks execute forward together), # where all data parallel ranks execute forward together),
# we initialize the all2all manager used in expert parallel. # we initialize the all2all manager used in expert parallel.
use_ep = config.parallel_config.data_parallel_size > 1 use_ep = config.parallel_config.data_parallel_size > 1 and not config.parallel_config.enable_dp_attention
self.use_all2all = "ep" in unique_name and use_ep self.use_all2all = "ep" in unique_name and use_ep
self.all2all_manager: Optional[All2AllManagerBase] = None self.all2all_manager: Optional[All2AllManagerBase] = None
......
...@@ -203,6 +203,7 @@ if TYPE_CHECKING: ...@@ -203,6 +203,7 @@ if TYPE_CHECKING:
VLLM_V1_USE_REDUCED_TOPK_TOPP_SAMPLER: bool = False VLLM_V1_USE_REDUCED_TOPK_TOPP_SAMPLER: bool = False
VLLM_USE_FUSED_FILL_RMS_CAT:bool = False VLLM_USE_FUSED_FILL_RMS_CAT:bool = False
VLLM_ENABLE_DEEPEP_HT_DEEPGEMM: bool = True VLLM_ENABLE_DEEPEP_HT_DEEPGEMM: bool = True
VLLM_ENABLE_DEEPEP_INT8_DISPATCH: bool = True
VLLM_ZERO_OVERHEAD_ENHANCE: bool = False VLLM_ZERO_OVERHEAD_ENHANCE: bool = False
VLLM_USE_FUSED_QA_KVA_GEMM: bool = False VLLM_USE_FUSED_QA_KVA_GEMM: bool = False
VLLM_V1_FAST_TOKEN_ID_COPY: bool = False VLLM_V1_FAST_TOKEN_ID_COPY: bool = False
...@@ -1317,6 +1318,11 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -1317,6 +1318,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_ENABLE_DEEPEP_HT_DEEPGEMM": "VLLM_ENABLE_DEEPEP_HT_DEEPGEMM":
lambda: (os.getenv('VLLM_ENABLE_DEEPEP_HT_DEEPGEMM', '1').lower() in lambda: (os.getenv('VLLM_ENABLE_DEEPEP_HT_DEEPGEMM', '1').lower() in
("true", "1")), ("true", "1")),
# vLLM will use deepep int8 dispatch
"VLLM_ENABLE_DEEPEP_INT8_DISPATCH":
lambda: (os.getenv('VLLM_ENABLE_DEEPEP_INT8_DISPATCH', '1').lower() in
("true", "1")),
# Only quantized DeepSeek models supported. # Only quantized DeepSeek models supported.
# Unquantized versions are not supported. # Unquantized versions are not supported.
......
...@@ -136,8 +136,8 @@ def set_forward_context( ...@@ -136,8 +136,8 @@ def set_forward_context(
forward_start_time = time.perf_counter() forward_start_time = time.perf_counter()
dp_metadata: Optional[DPMetadata] = None dp_metadata: Optional[DPMetadata] = None
dp_size = vllm_config.parallel_config.data_parallel_size dp_size = vllm_config.parallel_config.data_parallel_size
use_navie_ep = envs.VLLM_ALL2ALL_BACKEND == 'naive' and dp_size > 1 and vllm_config.parallel_config.enable_expert_parallel use_navie_all2all = envs.VLLM_ALL2ALL_BACKEND == 'naive' and dp_size > 1
if use_navie_ep and dp_size > 1 and ( if use_navie_all2all and dp_size > 1 and (
attn_metadata is not None or num_tokens is not None): attn_metadata is not None or num_tokens is not None):
dp_metadata = DPMetadata.make(vllm_config.parallel_config, dp_metadata = DPMetadata.make(vllm_config.parallel_config,
attn_metadata, num_tokens or 0, attn_metadata, num_tokens or 0,
......
...@@ -192,7 +192,7 @@ class FusedMoEMethodBase(QuantizeMethodBase): ...@@ -192,7 +192,7 @@ class FusedMoEMethodBase(QuantizeMethodBase):
and moe.quant_config.block_shape and moe.quant_config.block_shape
== DEEPEP_QUANT_BLOCK_SHAPE) == DEEPEP_QUANT_BLOCK_SHAPE)
use_int8_dispatch = False use_int8_dispatch = moe.quant_config.quant_dtype == torch.int8 and envs.VLLM_ENABLE_DEEPEP_HT_DEEPGEMM
ll_prepare_finalize = DeepEPLLPrepareAndFinalize( ll_prepare_finalize = DeepEPLLPrepareAndFinalize(
ll_handle, ll_handle,
...@@ -249,7 +249,7 @@ class FusedMoEMethodBase(QuantizeMethodBase): ...@@ -249,7 +249,7 @@ class FusedMoEMethodBase(QuantizeMethodBase):
and moe.quant_config.block_shape and moe.quant_config.block_shape
== DEEPEP_QUANT_BLOCK_SHAPE) == DEEPEP_QUANT_BLOCK_SHAPE)
use_int8_dispatch = moe.quant_config.quant_dtype == torch.int8 use_int8_dispatch = moe.quant_config.quant_dtype == torch.int8 and envs.VLLM_ENABLE_DEEPEP_HT_DEEPGEMM
# Note (varun): Whether to use FP8 dispatch or not needs some # Note (varun): Whether to use FP8 dispatch or not needs some
# profiling. Turning it off for now. # profiling. Turning it off for now.
......
...@@ -96,6 +96,10 @@ class EagleProposer: ...@@ -96,6 +96,10 @@ class EagleProposer:
self.enable_dp_attention = vllm_config.parallel_config.enable_dp_attention self.enable_dp_attention = vllm_config.parallel_config.enable_dp_attention
self.attn_tp_size = vllm_config.parallel_config.tensor_parallel_size self.attn_tp_size = vllm_config.parallel_config.tensor_parallel_size
self.ep_sp = False
if self.enable_expert_parallel and self.dp_size > 1 and self.attn_tp_size > 1:
self.ep_sp = True
def propose( def propose(
self, self,
# [num_tokens] # [num_tokens]
...@@ -193,8 +197,10 @@ class EagleProposer: ...@@ -193,8 +197,10 @@ class EagleProposer:
if self.enable_dp_attention: if self.enable_dp_attention:
num_input_tokens = round_up(num_input_tokens, self.attn_tp_size) num_input_tokens = round_up(num_input_tokens, self.attn_tp_size)
num_pad, num_tokens_across_dp = self.get_dp_padding(num_input_tokens)
num_input_tokens += num_pad # num_pad, num_tokens_across_dp = self.get_dp_padding(num_input_tokens)
# num_input_tokens += num_pad
# copy inputs to buffer for cudagraph # copy inputs to buffer for cudagraph
self.positions[:num_tokens] = target_positions self.positions[:num_tokens] = target_positions
self.hidden_states[:num_tokens] = target_hidden_states self.hidden_states[:num_tokens] = target_hidden_states
...@@ -542,11 +548,10 @@ class EagleProposer: ...@@ -542,11 +548,10 @@ class EagleProposer:
# #
# TODO(tms) : There are many cases where padding is enabled for # TODO(tms) : There are many cases where padding is enabled for
# prefills, causing unnecessary and excessive padding of activations. # prefills, causing unnecessary and excessive padding of activations.
if not self.enable_dp_attention and not envs.VLLM_ALL2ALL_BACKEND == "deepep_auto": if dp_size == 1 or self.vllm_config.model_config.enforce_eager:
if dp_size == 1 or self.vllm_config.model_config.enforce_eager or envs.VLLM_ALL2ALL_BACKEND != 'naive': # Early exit.
# Early exit. return 0, None
return 0, None
try: try:
num_tokens_across_dp = DPMetadata.num_tokens_across_dp( num_tokens_across_dp = DPMetadata.num_tokens_across_dp(
...@@ -569,6 +574,7 @@ class EagleProposer: ...@@ -569,6 +574,7 @@ class EagleProposer:
self, self,
num_tokens: int, num_tokens: int,
attn_metadata: Optional[dict[str, Any]] = None, attn_metadata: Optional[dict[str, Any]] = None,
num_tokens_across_dp: Optional[torch.Tensor] = None,
) -> None: ) -> None:
if attn_metadata is not None and self.attn_metadata_cudagraph is None: if attn_metadata is not None and self.attn_metadata_cudagraph is None:
self.attn_metadata_cudagraph = attn_metadata[ self.attn_metadata_cudagraph = attn_metadata[
...@@ -576,12 +582,13 @@ class EagleProposer: ...@@ -576,12 +582,13 @@ class EagleProposer:
# Padding for DP # Padding for DP
num_input_tokens = num_tokens num_input_tokens = num_tokens
num_pad, _ = self.get_dp_padding(num_tokens) # num_pad, num_tokens_across_dp = self.get_dp_padding(num_tokens)
num_input_tokens += num_pad # num_input_tokens += num_pad
with set_forward_context(attn_metadata, with set_forward_context(attn_metadata,
self.vllm_config, self.vllm_config,
num_tokens=num_tokens): num_tokens=num_tokens,
num_tokens_across_dp=num_tokens_across_dp):
self.model( self.model(
self.input_ids[:num_input_tokens], self.input_ids[:num_input_tokens],
self.positions[:num_input_tokens], self.positions[:num_input_tokens],
...@@ -590,10 +597,12 @@ class EagleProposer: ...@@ -590,10 +597,12 @@ class EagleProposer:
if self.dp_size > 1 and (self.enable_expert_parallel or self.enable_dp_attention) and self.num_speculative_tokens > 1: if self.dp_size > 1 and (self.enable_expert_parallel or self.enable_dp_attention) and self.num_speculative_tokens > 1:
num_tokens = 1 num_tokens = 1
if self.enable_dp_attention or self.ep_sp:
num_tokens = round_up(num_tokens, self.attn_tp_size)
# dp attention need all dp rank process same number tokens # dp attention need all dp rank process same number tokens
if self.enable_dp_attention: if self.enable_dp_attention:
num_tokens = round_up(num_tokens, self.attn_tp_size) num_pad, num_tokens_across_dp = self.get_dp_padding(num_tokens)
num_pad, _ = self.get_dp_padding(num_tokens)
num_tokens += num_pad num_tokens += num_pad
if not get_warming_up(): if not get_warming_up():
...@@ -621,19 +630,20 @@ class EagleProposer: ...@@ -621,19 +630,20 @@ class EagleProposer:
attn_metadata_cudagraph.num_actual_tokens = num_tokens attn_metadata_cudagraph.num_actual_tokens = num_tokens
attn_metadata_cudagraph.num_decodes = num_tokens attn_metadata_cudagraph.num_decodes = num_tokens
attn_metadata_cudagraph.num_decode_tokens = num_tokens attn_metadata_cudagraph.num_decode_tokens = num_tokens
self.attn_metadata_cudagraph.slot_mapping[:num_tokens] = ( attn_metadata_cudagraph.slot_mapping[:num_tokens] = (
attn_metadata.slot_mapping) attn_metadata.slot_mapping)
attn_metadata_cudagraph.decode.seq_lens[:num_tokens] = ( attn_metadata_cudagraph.decode.seq_lens[:num_tokens] = (
attn_metadata.decode.seq_lens) attn_metadata.decode.seq_lens)
self.attn_metadata_cudagraph.query_start_loc[:num_tokens + 1] = ( attn_metadata_cudagraph.query_start_loc[:num_tokens + 1] = (
attn_metadata.query_start_loc) attn_metadata.query_start_loc)
self.attn_metadata_cudagraph.decode.block_table[:num_tokens] = ( attn_metadata_cudagraph.decode.block_table[:num_tokens] = (
attn_metadata.decode.block_table) attn_metadata.decode.block_table)
with set_forward_context(attn_metadata, with set_forward_context(attn_metadata,
self.vllm_config, self.vllm_config,
num_tokens=num_tokens): num_tokens=num_tokens,
num_tokens_across_dp=num_tokens_across_dp):
self.model( self.model(
self.input_ids[:num_tokens], self.input_ids[:num_tokens],
self.positions[:num_tokens], self.positions[:num_tokens],
......
...@@ -1276,11 +1276,9 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin): ...@@ -1276,11 +1276,9 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
# #
# TODO(tms) : There are many cases where padding is enabled for # TODO(tms) : There are many cases where padding is enabled for
# prefills, causing unnecessary and excessive padding of activations. # prefills, causing unnecessary and excessive padding of activations.
if dp_size == 1 or self.vllm_config.model_config.enforce_eager:
if not self.enable_dp_attention and not envs.VLLM_ALL2ALL_BACKEND == "deepep_auto": # Early exit.
if dp_size == 1 or self.vllm_config.model_config.enforce_eager or envs.VLLM_ALL2ALL_BACKEND != 'naive': return 0, None
# Early exit.
return 0, None
num_tokens_across_dp = DPMetadata.num_tokens_across_dp( num_tokens_across_dp = DPMetadata.num_tokens_across_dp(
num_tokens, dp_size, dp_rank) num_tokens, dp_size, dp_rank)
...@@ -2246,7 +2244,8 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin): ...@@ -2246,7 +2244,8 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
if self.speculative_config and self.speculative_config.use_eagle() and not is_profile: if self.speculative_config and self.speculative_config.use_eagle() and not is_profile:
#assert isinstance(self.drafter, EagleProposer) #assert isinstance(self.drafter, EagleProposer)
if hasattr(self, 'drafter') and isinstance(self.drafter, EagleProposer): if hasattr(self, 'drafter') and isinstance(self.drafter, EagleProposer):
self.drafter.dummy_run(num_tokens, attn_metadata) self.drafter.dummy_run(num_tokens, attn_metadata,
num_tokens_across_dp=num_tokens_across_dp)
# This is necessary to avoid blocking DP. # This is necessary to avoid blocking DP.
# For dummy runs, we typically skip EPLB since we don't have any real # For dummy runs, we typically skip EPLB since we don't have any real
......
...@@ -114,8 +114,9 @@ class V1ZeroEagleProposer(EagleProposer): ...@@ -114,8 +114,9 @@ class V1ZeroEagleProposer(EagleProposer):
if self.enable_dp_attention: if self.enable_dp_attention:
num_input_tokens = round_up(num_input_tokens, self.attn_tp_size) num_input_tokens = round_up(num_input_tokens, self.attn_tp_size)
num_pad, num_tokens_across_dp = self.get_dp_padding(num_input_tokens)
num_input_tokens += num_pad # num_pad, num_tokens_across_dp = self.get_dp_padding(num_input_tokens)
# num_input_tokens += num_pad
# copy inputs to buffer for cudagraph # copy inputs to buffer for cudagraph
self.positions[:num_tokens] = target_positions self.positions[:num_tokens] = target_positions
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment