Commit b956fc64 authored by zhuwenwen's avatar zhuwenwen
Browse files

[fix]解决moe_fused_gate编译错误,去掉mla中mtp部分的修改

restore the default setting of disable_cascade_attn (False)
add VLLM_USE_OPT_ZEROS to replace triton_ (torch.zeros) with torch.empty when allocating the attention output
set default_max_num_batched_tokens = 10240 for UsageContext.OPENAI_API_SERVER (previously 2048)
update the q_norm/k_norm layernorm calls in qwen3_moe to use forward_cuda
parent 1a9b2fa9
......@@ -219,9 +219,14 @@ class Attention(nn.Module):
if self.use_output:
output_shape = (output_shape
if output_shape is not None else query.shape)
output = torch.zeros(output_shape,
dtype=query.dtype,
device=query.device)
if envs.VLLM_USE_OPT_ZEROS:
output = torch.empty(output_shape,
dtype=query.dtype,
device=query.device)
else:
output = torch.zeros(output_shape,
dtype=query.dtype,
device=query.device)
hidden_size = output_shape[-1]
# We skip reshaping query, key and value tensors for the MLA
# backend since these tensors have different semantics and are
......
......@@ -326,7 +326,7 @@ class ModelConfig:
"""Whether to disable sliding window. If True, we will disable the sliding
window functionality of the model, capping to sliding window size. If the
model does not support sliding window, this argument is ignored."""
disable_cascade_attn: bool = True
disable_cascade_attn: bool = False
"""Disable cascade attention for V1. While cascade attention does not
change the mathematical correctness, disabling it could be useful for
preventing potential numerical issues. Note that even if this is set to
......
......@@ -1664,7 +1664,7 @@ class EngineArgs:
# TODO(woosuk): Tune the default values for other hardware.
default_max_num_batched_tokens = {
UsageContext.LLM_CLASS: 8192,
UsageContext.OPENAI_API_SERVER: 2048,
UsageContext.OPENAI_API_SERVER: 10240,
}
default_max_num_seqs = {
UsageContext.LLM_CLASS: 256,
......
......@@ -1107,6 +1107,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USE_LIGHTOP":
lambda: (os.environ.get("VLLM_USE_LIGHTOP", "False").lower() in
("true", "1")),
# vLLM will use elementwise not triton_
"VLLM_USE_OPT_ZEROS":
lambda: (os.environ.get("VLLM_USE_OPT_ZEROS", "False").lower() in
("true", "1")),
# vLLM will use opt cat for deepseek-v3
"VLLM_USE_OPT_CAT":
lambda: (os.environ.get("VLLM_USE_OPT_CAT", "False").lower() in
......
......@@ -224,7 +224,7 @@ class DeepSeekMTP(nn.Module, SupportsPP):
params_dict = dict(self.named_parameters())
loaded_params: set[str] = set()
for name, loaded_weight in weights:
if "rotary_emb.inv_freq" in name:
if "rotary_emb.inv_freq" in name or "indexer" in name:
continue
spec_layer = get_spec_layer_idx_from_weight_name(self.config, name)
if spec_layer is None:
......
......@@ -234,7 +234,7 @@ class Qwen3MoeAttention(nn.Module):
if envs.VLLM_USE_APEX_RN:
q_by_head = self.q_norm.forward_apex(q_by_head)
else:
q_by_head = self.q_norm(q_by_head)
q_by_head = self.q_norm.forward_cuda(q_by_head)
q = q_by_head.view(q.shape)
k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim,
......@@ -242,7 +242,7 @@ class Qwen3MoeAttention(nn.Module):
if envs.VLLM_USE_APEX_RN:
k_by_head = self.k_norm.forward_apex(k_by_head)
else:
k_by_head = self.k_norm(k_by_head)
k_by_head = self.k_norm.forward_cuda(k_by_head)
k = k_by_head.view(k.shape)
q, k = self.rotary_emb(positions, q, k)
attn_output = self.attn(q, k, v)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment