Commit 7624bd05 authored by zhuwenwen
Browse files

[qwen3-235b] MoE(TN&NN) configs for nmz TP=8

[qwen3-480b] MoE(TN) configs for nmz TP=8
[opt] 优化deepep相关代码
[fix] 修复deepseek moe模型的awq量化推理bug和精度问题, 修复awq模型的VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD设置位置, update_state,优化性能,去除冗余操作
pcie 解决custom cudagraph模式需要拷贝的问题,需要配合dtk进行使用
[feat] Switch default w8a8 gemm impl to blaslt. Support w8a8-fp8 GEMM backend. MoE 路由抓取:新增 router_capture 工具链与 envs 统一配置
[envs] set VLLM_CUSTOM_CACHE=1、VLLM_USE_FUSED_RMS_ROPE=1、VLLM_USE_FUSED_FILL_RMS_CAT=1、VLLM_USE_FLASH_ATTN_FP8=1、VLLM_USE_FLASH_MLA_FP8=1、update VLLM_USE_TOPK_RENORM
parent ad1d74cf
......@@ -58,6 +58,7 @@ from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
from vllm import _custom_ops as ops
from vllm.model_executor.utils import pad_weight, gemm_bank_conf
import vllm.envs as envs
FalconConfig = Union[HF_FalconConfig, RWConfig]
......@@ -393,7 +394,7 @@ class FalconModel(nn.Module):
self.use_gemm_pad = os.environ.get('GEMM_PAD') == '1'
self.use_fa_pad = os.environ.get('FA_PAD') == '1'
self.use_awq_pad = os.environ.get('AWQ_PAD') == '1'
self.w8a8_strategy=int(os.getenv('W8A8_SUPPORT_METHODS', '1'))
self.w8a8_strategy = envs.VLLM_W8A8_BACKEND
def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
return self.word_embeddings(input_ids)
......
......@@ -31,6 +31,7 @@ from typing import Optional, Union
import torch
from torch import nn
from transformers import Glm4Config
import vllm.envs as envs
class MultiModalConfigProxy:
......@@ -332,7 +333,7 @@ class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
self.use_gemm_pad = os.environ.get('GEMM_PAD') == '1'
self.use_fa_pad = os.environ.get('FA_PAD') == '1'
self.use_awq_pad = os.environ.get('AWQ_PAD') == '1'
self.w8a8_strategy=int(os.getenv('W8A8_SUPPORT_METHODS', '1'))
self.w8a8_strategy = envs.VLLM_W8A8_BACKEND
def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
return self.model.get_input_embeddings(input_ids)
......
......@@ -38,6 +38,19 @@ from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
from vllm.logger import init_logger
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.fused_moe import FusedMoE
try:
    from vllm.model_executor.layers.fused_moe.router_capture import (
        maybe_record_router_logits,
    )
except ImportError:
    # The router_capture module could not be imported: fall back to a stub
    # with the same keyword-only signature so call sites need no guard.
    def maybe_record_router_logits(
        *,
        layer_name: str,
        router_logits: torch.Tensor,
        top_k: int,
    ) -> None:
        """No-op stand-in used when ``router_capture`` is unavailable.

        Accepts the same keyword-only arguments as the real helper,
        ignores them, and returns ``None``.
        """
        return None
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
QKVParallelLinear,
......@@ -111,6 +124,8 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
):
super().__init__()
self.tp_size = get_tensor_model_parallel_world_size()
self._router_top_k = int(config.num_experts_per_tok)
self._router_capture_layer_name = prefix
if self.tp_size > config.num_experts:
raise ValueError(
......@@ -140,6 +155,14 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
# router_logits: (num_tokens, n_experts)
router_logits, _ = self.gate(hidden_states)
if not (hasattr(torch, "_dynamo") and torch._dynamo.is_compiling()):
capture_enabled = envs.VLLM_MOE_ROUTER_CAPTURE
if capture_enabled:
maybe_record_router_logits(
layer_name=self._router_capture_layer_name,
router_logits=router_logits,
top_k=self._router_top_k,
)
final_hidden_states = self.experts(hidden_states=hidden_states,
router_logits=router_logits)
......@@ -453,7 +476,7 @@ class Qwen3MoeModel(nn.Module):
self.use_gemm_pad = os.environ.get('GEMM_PAD') == '1'
self.use_fa_pad = os.environ.get('FA_PAD') == '1'
self.use_awq_pad = os.environ.get('AWQ_PAD') == '1'
self.w8a8_strategy=int(os.getenv('W8A8_SUPPORT_METHODS', '1'))
self.w8a8_strategy = envs.VLLM_W8A8_BACKEND
def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
return self.embed_tokens(input_ids)
......
......@@ -37,6 +37,7 @@ from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
from vllm.utils import W8a8GetCacheJSON
from vllm import _custom_ops as ops
from vllm.model_executor.utils import pad_weight, gemm_bank_conf
import vllm.envs as envs
class TeleChat2Model(LlamaModel):
......@@ -66,7 +67,7 @@ class TeleChat2Model(LlamaModel):
self.use_gemm_pad = os.environ.get('GEMM_PAD') == '1'
self.use_fa_pad = os.environ.get('FA_PAD') == '1'
self.use_awq_pad = os.environ.get('AWQ_PAD') == '1'
self.w8a8_strategy=int(os.getenv('W8A8_SUPPORT_METHODS', '1'))
self.w8a8_strategy = envs.VLLM_W8A8_BACKEND
def load_weights(self, weights: Iterable[tuple[str,
torch.Tensor]]) -> set[str]:
......
......@@ -95,6 +95,10 @@ class EagleProposer:
self.enable_expert_parallel = vllm_config.parallel_config.enable_expert_parallel
self.enable_dp_attention = vllm_config.parallel_config.enable_dp_attention
self.attn_tp_size = vllm_config.parallel_config.tensor_parallel_size
self.ep_sp = False
if self.enable_expert_parallel and self.dp_size > 1 and self.attn_tp_size > 1:
self.ep_sp = True
def propose(
self,
......@@ -194,8 +198,8 @@ class EagleProposer:
if self.enable_dp_attention:
num_input_tokens = round_up(num_input_tokens, self.attn_tp_size)
num_pad, num_tokens_across_dp = self.get_dp_padding(num_input_tokens)
num_input_tokens += num_pad
# num_pad, num_tokens_across_dp = self.get_dp_padding(num_input_tokens)
# num_input_tokens += num_pad
# copy inputs to buffer for cudagraph
self.positions[:num_tokens] = target_positions
self.hidden_states[:num_tokens] = target_hidden_states
......@@ -543,10 +547,9 @@ class EagleProposer:
# TODO(tms) : There are many cases where padding is enabled for
# prefills, causing unnecessary and excessive padding of activations.
if not self.enable_dp_attention and not envs.VLLM_ALL2ALL_BACKEND == "deepep_auto":
if dp_size == 1 or self.vllm_config.model_config.enforce_eager or envs.VLLM_ALL2ALL_BACKEND != 'naive':
# Early exit.
return 0, None
if dp_size == 1 or self.vllm_config.model_config.enforce_eager:
# Early exit.
return 0, None
try:
num_tokens_across_dp = DPMetadata.num_tokens_across_dp(
......@@ -569,6 +572,7 @@ class EagleProposer:
self,
num_tokens: int,
attn_metadata: Optional[dict[str, Any]] = None,
num_tokens_across_dp: Optional[torch.Tensor] = None,
) -> None:
if attn_metadata is not None and self.attn_metadata_cudagraph is None:
self.attn_metadata_cudagraph = attn_metadata[
......@@ -576,12 +580,13 @@ class EagleProposer:
# Padding for DP
num_input_tokens = num_tokens
num_pad, _ = self.get_dp_padding(num_tokens)
num_input_tokens += num_pad
# num_pad, _ = self.get_dp_padding(num_tokens)
# num_input_tokens += num_pad
with set_forward_context(attn_metadata,
self.vllm_config,
num_tokens=num_tokens):
num_tokens=num_tokens,
num_tokens_across_dp=num_tokens_across_dp):
self.model(
self.input_ids[:num_input_tokens],
self.positions[:num_input_tokens],
......@@ -590,10 +595,13 @@ class EagleProposer:
if self.dp_size > 1 and (self.enable_expert_parallel or self.enable_dp_attention) and self.num_speculative_tokens > 1:
num_tokens = 1
if self.enable_dp_attention or self.ep_sp:
num_tokens = round_up(num_tokens, self.attn_tp_size)
# dp attention need all dp rank process same number tokens
if self.enable_dp_attention:
num_tokens = round_up(num_tokens, self.attn_tp_size)
num_pad, _ = self.get_dp_padding(num_tokens)
num_pad, num_tokens_across_dp = self.get_dp_padding(num_tokens)
num_tokens += num_pad
if not get_warming_up():
......@@ -621,19 +629,20 @@ class EagleProposer:
attn_metadata_cudagraph.num_actual_tokens = num_tokens
attn_metadata_cudagraph.num_decodes = num_tokens
attn_metadata_cudagraph.num_decode_tokens = num_tokens
self.attn_metadata_cudagraph.slot_mapping[:num_tokens] = (
attn_metadata_cudagraph.slot_mapping[:num_tokens] = (
attn_metadata.slot_mapping)
attn_metadata_cudagraph.decode.seq_lens[:num_tokens] = (
attn_metadata.decode.seq_lens)
self.attn_metadata_cudagraph.query_start_loc[:num_tokens + 1] = (
attn_metadata_cudagraph.query_start_loc[:num_tokens + 1] = (
attn_metadata.query_start_loc)
self.attn_metadata_cudagraph.decode.block_table[:num_tokens] = (
attn_metadata_cudagraph.decode.block_table[:num_tokens] = (
attn_metadata.decode.block_table)
with set_forward_context(attn_metadata,
self.vllm_config,
num_tokens=num_tokens):
num_tokens=num_tokens,
num_tokens_across_dp=num_tokens_across_dp):
self.model(
self.input_ids[:num_tokens],
self.positions[:num_tokens],
......
......@@ -515,7 +515,9 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
# Add the sampled token(s) from the previous step (if any).
# This doesn't include "unverified" tokens like spec tokens.
num_new_tokens = len(new_token_ids)
if num_new_tokens > 0:
if num_new_tokens == 1:
req_state.output_token_ids.append(new_token_ids[-1])
elif num_new_tokens > 0:
req_state.output_token_ids.extend(
new_token_ids)
if len(spec_token_ids) > 0:
......@@ -537,13 +539,8 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
# The request is not in the persistent batch.
# The request was either preempted and resumed later, or was not
# scheduled in the previous step and needs to be added again.
if not is_last_rank:
req_state = self.requests[req_id]
self.input_batch.add_request(req_state)
req_index = self.input_batch.req_id_to_index.get(req_id)
else:
req_ids_to_add.append(req_id)
continue
req_ids_to_add.append(req_id)
continue
# Update the persistent batch.
self.input_batch.num_computed_tokens_cpu[req_index] = (
......@@ -2257,7 +2254,8 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
if self.speculative_config and self.speculative_config.use_eagle() and not is_profile:
#assert isinstance(self.drafter, EagleProposer)
if hasattr(self, 'drafter') and isinstance(self.drafter, EagleProposer):
self.drafter.dummy_run(num_tokens, attn_metadata)
self.drafter.dummy_run(num_tokens, attn_metadata,
num_tokens_across_dp=num_tokens_across_dp)
# This is necessary to avoid blocking DP.
# For dummy runs, we typically skip EPLB since we don't have any real
......
......@@ -115,8 +115,8 @@ class V1ZeroEagleProposer(EagleProposer):
if self.enable_dp_attention:
num_input_tokens = round_up(num_input_tokens, self.attn_tp_size)
num_pad, num_tokens_across_dp = self.get_dp_padding(num_input_tokens)
num_input_tokens += num_pad
# num_pad, num_tokens_across_dp = self.get_dp_padding(num_input_tokens)
# num_input_tokens += num_pad
# copy inputs to buffer for cudagraph
self.positions[:num_tokens] = target_positions
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment