Commit 7624bd05 authored by zhuwenwen
Browse files

[qwen3-235b] MoE(TN&NN) configs for nmz TP=8

[qwen3-480b] MoE(TN) configs for nmz TP=8
[opt] 优化deepep相关代码
[fix] 修复deepseek moe模型的awq量化推理bug和精度问题, 修复awq模型的VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD设置位置, update_state,优化性能,去除冗余操作
pcie 解决custom cudagraph模式需要拷贝的问题,需要配合dtk进行使用
[feat] Switch default w8a8 gemm impl to blaslt. Support w8a8-fp8 GEMM backend. MoE 路由抓取:新增 router_capture 工具链与 envs 统一配置
[envs] set VLLM_CUSTOM_CACHE=1、VLLM_USE_FUSED_RMS_ROPE=1、VLLM_USE_FUSED_FILL_RMS_CAT=1、VLLM_USE_FLASH_ATTN_FP8=1、VLLM_USE_FLASH_MLA_FP8=1、update VLLM_USE_TOPK_RENORM
parent ad1d74cf
...@@ -58,6 +58,7 @@ from .utils import (AutoWeightsLoader, is_pp_missing_parameter, ...@@ -58,6 +58,7 @@ from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.model_executor.utils import pad_weight, gemm_bank_conf from vllm.model_executor.utils import pad_weight, gemm_bank_conf
import vllm.envs as envs
FalconConfig = Union[HF_FalconConfig, RWConfig] FalconConfig = Union[HF_FalconConfig, RWConfig]
...@@ -393,7 +394,7 @@ class FalconModel(nn.Module): ...@@ -393,7 +394,7 @@ class FalconModel(nn.Module):
self.use_gemm_pad = os.environ.get('GEMM_PAD') == '1' self.use_gemm_pad = os.environ.get('GEMM_PAD') == '1'
self.use_fa_pad = os.environ.get('FA_PAD') == '1' self.use_fa_pad = os.environ.get('FA_PAD') == '1'
self.use_awq_pad = os.environ.get('AWQ_PAD') == '1' self.use_awq_pad = os.environ.get('AWQ_PAD') == '1'
self.w8a8_strategy=int(os.getenv('W8A8_SUPPORT_METHODS', '1')) self.w8a8_strategy = envs.VLLM_W8A8_BACKEND
def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
return self.word_embeddings(input_ids) return self.word_embeddings(input_ids)
......
...@@ -31,6 +31,7 @@ from typing import Optional, Union ...@@ -31,6 +31,7 @@ from typing import Optional, Union
import torch import torch
from torch import nn from torch import nn
from transformers import Glm4Config from transformers import Glm4Config
import vllm.envs as envs
class MultiModalConfigProxy: class MultiModalConfigProxy:
...@@ -332,7 +333,7 @@ class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -332,7 +333,7 @@ class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self.use_gemm_pad = os.environ.get('GEMM_PAD') == '1' self.use_gemm_pad = os.environ.get('GEMM_PAD') == '1'
self.use_fa_pad = os.environ.get('FA_PAD') == '1' self.use_fa_pad = os.environ.get('FA_PAD') == '1'
self.use_awq_pad = os.environ.get('AWQ_PAD') == '1' self.use_awq_pad = os.environ.get('AWQ_PAD') == '1'
self.w8a8_strategy=int(os.getenv('W8A8_SUPPORT_METHODS', '1')) self.w8a8_strategy = envs.VLLM_W8A8_BACKEND
def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
return self.model.get_input_embeddings(input_ids) return self.model.get_input_embeddings(input_ids)
......
...@@ -38,6 +38,19 @@ from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size ...@@ -38,6 +38,19 @@ from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe import FusedMoE
# Router-logit capture is optional tooling. If the router_capture module
# cannot be imported, define a no-op with the keyword arguments the call site
# in this file passes, so the call does not fail when the module is absent.
try:
    from vllm.model_executor.layers.fused_moe.router_capture import (
        maybe_record_router_logits,
    )
except ImportError:

    def maybe_record_router_logits(
        *,
        layer_name: str,
        router_logits: torch.Tensor,
        top_k: int,
    ) -> None:
        """Do nothing; stand-in used when router_capture is not importable.

        All arguments are keyword-only and are ignored.

        Args:
            layer_name: Name identifying the calling MoE layer.
            router_logits: Router output tensor for the current batch.
            top_k: Number of experts selected per token.
        """
        return None
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
QKVParallelLinear, QKVParallelLinear,
...@@ -111,6 +124,8 @@ class Qwen3MoeSparseMoeBlock(nn.Module): ...@@ -111,6 +124,8 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
): ):
super().__init__() super().__init__()
self.tp_size = get_tensor_model_parallel_world_size() self.tp_size = get_tensor_model_parallel_world_size()
self._router_top_k = int(config.num_experts_per_tok)
self._router_capture_layer_name = prefix
if self.tp_size > config.num_experts: if self.tp_size > config.num_experts:
raise ValueError( raise ValueError(
...@@ -140,6 +155,14 @@ class Qwen3MoeSparseMoeBlock(nn.Module): ...@@ -140,6 +155,14 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
# router_logits: (num_tokens, n_experts) # router_logits: (num_tokens, n_experts)
router_logits, _ = self.gate(hidden_states) router_logits, _ = self.gate(hidden_states)
if not (hasattr(torch, "_dynamo") and torch._dynamo.is_compiling()):
capture_enabled = envs.VLLM_MOE_ROUTER_CAPTURE
if capture_enabled:
maybe_record_router_logits(
layer_name=self._router_capture_layer_name,
router_logits=router_logits,
top_k=self._router_top_k,
)
final_hidden_states = self.experts(hidden_states=hidden_states, final_hidden_states = self.experts(hidden_states=hidden_states,
router_logits=router_logits) router_logits=router_logits)
...@@ -453,7 +476,7 @@ class Qwen3MoeModel(nn.Module): ...@@ -453,7 +476,7 @@ class Qwen3MoeModel(nn.Module):
self.use_gemm_pad = os.environ.get('GEMM_PAD') == '1' self.use_gemm_pad = os.environ.get('GEMM_PAD') == '1'
self.use_fa_pad = os.environ.get('FA_PAD') == '1' self.use_fa_pad = os.environ.get('FA_PAD') == '1'
self.use_awq_pad = os.environ.get('AWQ_PAD') == '1' self.use_awq_pad = os.environ.get('AWQ_PAD') == '1'
self.w8a8_strategy=int(os.getenv('W8A8_SUPPORT_METHODS', '1')) self.w8a8_strategy = envs.VLLM_W8A8_BACKEND
def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
return self.embed_tokens(input_ids) return self.embed_tokens(input_ids)
......
...@@ -37,6 +37,7 @@ from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper, ...@@ -37,6 +37,7 @@ from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
from vllm.utils import W8a8GetCacheJSON from vllm.utils import W8a8GetCacheJSON
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.model_executor.utils import pad_weight, gemm_bank_conf from vllm.model_executor.utils import pad_weight, gemm_bank_conf
import vllm.envs as envs
class TeleChat2Model(LlamaModel): class TeleChat2Model(LlamaModel):
...@@ -66,7 +67,7 @@ class TeleChat2Model(LlamaModel): ...@@ -66,7 +67,7 @@ class TeleChat2Model(LlamaModel):
self.use_gemm_pad = os.environ.get('GEMM_PAD') == '1' self.use_gemm_pad = os.environ.get('GEMM_PAD') == '1'
self.use_fa_pad = os.environ.get('FA_PAD') == '1' self.use_fa_pad = os.environ.get('FA_PAD') == '1'
self.use_awq_pad = os.environ.get('AWQ_PAD') == '1' self.use_awq_pad = os.environ.get('AWQ_PAD') == '1'
self.w8a8_strategy=int(os.getenv('W8A8_SUPPORT_METHODS', '1')) self.w8a8_strategy = envs.VLLM_W8A8_BACKEND
def load_weights(self, weights: Iterable[tuple[str, def load_weights(self, weights: Iterable[tuple[str,
torch.Tensor]]) -> set[str]: torch.Tensor]]) -> set[str]:
......
...@@ -96,6 +96,10 @@ class EagleProposer: ...@@ -96,6 +96,10 @@ class EagleProposer:
self.enable_dp_attention = vllm_config.parallel_config.enable_dp_attention self.enable_dp_attention = vllm_config.parallel_config.enable_dp_attention
self.attn_tp_size = vllm_config.parallel_config.tensor_parallel_size self.attn_tp_size = vllm_config.parallel_config.tensor_parallel_size
self.ep_sp = False
if self.enable_expert_parallel and self.dp_size > 1 and self.attn_tp_size > 1:
self.ep_sp = True
def propose( def propose(
self, self,
# [num_tokens] # [num_tokens]
...@@ -194,8 +198,8 @@ class EagleProposer: ...@@ -194,8 +198,8 @@ class EagleProposer:
if self.enable_dp_attention: if self.enable_dp_attention:
num_input_tokens = round_up(num_input_tokens, self.attn_tp_size) num_input_tokens = round_up(num_input_tokens, self.attn_tp_size)
num_pad, num_tokens_across_dp = self.get_dp_padding(num_input_tokens) # num_pad, num_tokens_across_dp = self.get_dp_padding(num_input_tokens)
num_input_tokens += num_pad # num_input_tokens += num_pad
# copy inputs to buffer for cudagraph # copy inputs to buffer for cudagraph
self.positions[:num_tokens] = target_positions self.positions[:num_tokens] = target_positions
self.hidden_states[:num_tokens] = target_hidden_states self.hidden_states[:num_tokens] = target_hidden_states
...@@ -543,8 +547,7 @@ class EagleProposer: ...@@ -543,8 +547,7 @@ class EagleProposer:
# TODO(tms) : There are many cases where padding is enabled for # TODO(tms) : There are many cases where padding is enabled for
# prefills, causing unnecessary and excessive padding of activations. # prefills, causing unnecessary and excessive padding of activations.
if not self.enable_dp_attention and not envs.VLLM_ALL2ALL_BACKEND == "deepep_auto": if dp_size == 1 or self.vllm_config.model_config.enforce_eager:
if dp_size == 1 or self.vllm_config.model_config.enforce_eager or envs.VLLM_ALL2ALL_BACKEND != 'naive':
# Early exit. # Early exit.
return 0, None return 0, None
...@@ -569,6 +572,7 @@ class EagleProposer: ...@@ -569,6 +572,7 @@ class EagleProposer:
self, self,
num_tokens: int, num_tokens: int,
attn_metadata: Optional[dict[str, Any]] = None, attn_metadata: Optional[dict[str, Any]] = None,
num_tokens_across_dp: Optional[torch.Tensor] = None,
) -> None: ) -> None:
if attn_metadata is not None and self.attn_metadata_cudagraph is None: if attn_metadata is not None and self.attn_metadata_cudagraph is None:
self.attn_metadata_cudagraph = attn_metadata[ self.attn_metadata_cudagraph = attn_metadata[
...@@ -576,12 +580,13 @@ class EagleProposer: ...@@ -576,12 +580,13 @@ class EagleProposer:
# Padding for DP # Padding for DP
num_input_tokens = num_tokens num_input_tokens = num_tokens
num_pad, _ = self.get_dp_padding(num_tokens) # num_pad, _ = self.get_dp_padding(num_tokens)
num_input_tokens += num_pad # num_input_tokens += num_pad
with set_forward_context(attn_metadata, with set_forward_context(attn_metadata,
self.vllm_config, self.vllm_config,
num_tokens=num_tokens): num_tokens=num_tokens,
num_tokens_across_dp=num_tokens_across_dp):
self.model( self.model(
self.input_ids[:num_input_tokens], self.input_ids[:num_input_tokens],
self.positions[:num_input_tokens], self.positions[:num_input_tokens],
...@@ -590,10 +595,13 @@ class EagleProposer: ...@@ -590,10 +595,13 @@ class EagleProposer:
if self.dp_size > 1 and (self.enable_expert_parallel or self.enable_dp_attention) and self.num_speculative_tokens > 1: if self.dp_size > 1 and (self.enable_expert_parallel or self.enable_dp_attention) and self.num_speculative_tokens > 1:
num_tokens = 1 num_tokens = 1
if self.enable_dp_attention or self.ep_sp:
num_tokens = round_up(num_tokens, self.attn_tp_size)
# dp attention need all dp rank process same number tokens # dp attention need all dp rank process same number tokens
if self.enable_dp_attention: if self.enable_dp_attention:
num_tokens = round_up(num_tokens, self.attn_tp_size) num_pad, num_tokens_across_dp = self.get_dp_padding(num_tokens)
num_pad, _ = self.get_dp_padding(num_tokens)
num_tokens += num_pad num_tokens += num_pad
if not get_warming_up(): if not get_warming_up():
...@@ -621,19 +629,20 @@ class EagleProposer: ...@@ -621,19 +629,20 @@ class EagleProposer:
attn_metadata_cudagraph.num_actual_tokens = num_tokens attn_metadata_cudagraph.num_actual_tokens = num_tokens
attn_metadata_cudagraph.num_decodes = num_tokens attn_metadata_cudagraph.num_decodes = num_tokens
attn_metadata_cudagraph.num_decode_tokens = num_tokens attn_metadata_cudagraph.num_decode_tokens = num_tokens
self.attn_metadata_cudagraph.slot_mapping[:num_tokens] = ( attn_metadata_cudagraph.slot_mapping[:num_tokens] = (
attn_metadata.slot_mapping) attn_metadata.slot_mapping)
attn_metadata_cudagraph.decode.seq_lens[:num_tokens] = ( attn_metadata_cudagraph.decode.seq_lens[:num_tokens] = (
attn_metadata.decode.seq_lens) attn_metadata.decode.seq_lens)
self.attn_metadata_cudagraph.query_start_loc[:num_tokens + 1] = ( attn_metadata_cudagraph.query_start_loc[:num_tokens + 1] = (
attn_metadata.query_start_loc) attn_metadata.query_start_loc)
self.attn_metadata_cudagraph.decode.block_table[:num_tokens] = ( attn_metadata_cudagraph.decode.block_table[:num_tokens] = (
attn_metadata.decode.block_table) attn_metadata.decode.block_table)
with set_forward_context(attn_metadata, with set_forward_context(attn_metadata,
self.vllm_config, self.vllm_config,
num_tokens=num_tokens): num_tokens=num_tokens,
num_tokens_across_dp=num_tokens_across_dp):
self.model( self.model(
self.input_ids[:num_tokens], self.input_ids[:num_tokens],
self.positions[:num_tokens], self.positions[:num_tokens],
......
...@@ -515,7 +515,9 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin): ...@@ -515,7 +515,9 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
# Add the sampled token(s) from the previous step (if any). # Add the sampled token(s) from the previous step (if any).
# This doesn't include "unverified" tokens like spec tokens. # This doesn't include "unverified" tokens like spec tokens.
num_new_tokens = len(new_token_ids) num_new_tokens = len(new_token_ids)
if num_new_tokens > 0: if num_new_tokens == 1:
req_state.output_token_ids.append(new_token_ids[-1])
elif num_new_tokens > 0:
req_state.output_token_ids.extend( req_state.output_token_ids.extend(
new_token_ids) new_token_ids)
if len(spec_token_ids) > 0: if len(spec_token_ids) > 0:
...@@ -537,11 +539,6 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin): ...@@ -537,11 +539,6 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
# The request is not in the persistent batch. # The request is not in the persistent batch.
# The request was either preempted and resumed later, or was not # The request was either preempted and resumed later, or was not
# scheduled in the previous step and needs to be added again. # scheduled in the previous step and needs to be added again.
if not is_last_rank:
req_state = self.requests[req_id]
self.input_batch.add_request(req_state)
req_index = self.input_batch.req_id_to_index.get(req_id)
else:
req_ids_to_add.append(req_id) req_ids_to_add.append(req_id)
continue continue
...@@ -2257,7 +2254,8 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin): ...@@ -2257,7 +2254,8 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
if self.speculative_config and self.speculative_config.use_eagle() and not is_profile: if self.speculative_config and self.speculative_config.use_eagle() and not is_profile:
#assert isinstance(self.drafter, EagleProposer) #assert isinstance(self.drafter, EagleProposer)
if hasattr(self, 'drafter') and isinstance(self.drafter, EagleProposer): if hasattr(self, 'drafter') and isinstance(self.drafter, EagleProposer):
self.drafter.dummy_run(num_tokens, attn_metadata) self.drafter.dummy_run(num_tokens, attn_metadata,
num_tokens_across_dp=num_tokens_across_dp)
# This is necessary to avoid blocking DP. # This is necessary to avoid blocking DP.
# For dummy runs, we typically skip EPLB since we don't have any real # For dummy runs, we typically skip EPLB since we don't have any real
......
...@@ -115,8 +115,8 @@ class V1ZeroEagleProposer(EagleProposer): ...@@ -115,8 +115,8 @@ class V1ZeroEagleProposer(EagleProposer):
if self.enable_dp_attention: if self.enable_dp_attention:
num_input_tokens = round_up(num_input_tokens, self.attn_tp_size) num_input_tokens = round_up(num_input_tokens, self.attn_tp_size)
num_pad, num_tokens_across_dp = self.get_dp_padding(num_input_tokens) # num_pad, num_tokens_across_dp = self.get_dp_padding(num_input_tokens)
num_input_tokens += num_pad # num_input_tokens += num_pad
# copy inputs to buffer for cudagraph # copy inputs to buffer for cudagraph
self.positions[:num_tokens] = target_positions self.positions[:num_tokens] = target_positions
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment