Commit e7f2785f authored by zhuwenwen
Browse files

Add VLLM_USE_OPT_ZEROS to replace torch.zeros (triton_) with torch.empty for the attention output buffer

Set default_max_num_batched_tokens to 10240 for OPENAI_API_SERVER
Update the q/k layernorm calls in qwen3_moe to use forward_cuda
Disable the lightop path for moe_fused_gate
parent 671dcfff
......@@ -297,9 +297,14 @@ class Attention(nn.Module, AttentionLayerBase):
if self.use_output:
output_shape = (output_shape
if output_shape is not None else query.shape)
output = torch.zeros(output_shape,
dtype=output_dtype,
device=query.device)
if envs.VLLM_USE_OPT_ZEROS:
output = torch.empty(output_shape,
dtype=query.dtype,
device=query.device)
else:
output = torch.zeros(output_shape,
dtype=query.dtype,
device=query.device)
hidden_size = output_shape[-1]
# We skip reshaping query, key and value tensors for the MLA
# backend since these tensors have different semantics and are
......
......@@ -199,7 +199,7 @@ class ModelConfig:
"""Whether to disable sliding window. If True, we will disable the sliding
window functionality of the model, capping to sliding window size. If the
model does not support sliding window, this argument is ignored."""
disable_cascade_attn: bool = True
disable_cascade_attn: bool = False
"""Disable cascade attention for V1. While cascade attention does not
change the mathematical correctness, disabling it could be useful for
preventing potential numerical issues. Note that even if this is set to
......
......@@ -1639,7 +1639,7 @@ class EngineArgs:
# TODO(woosuk): Tune the default values for other hardware.
default_max_num_batched_tokens = {
UsageContext.LLM_CLASS: 8192,
UsageContext.OPENAI_API_SERVER: 2048,
UsageContext.OPENAI_API_SERVER: 10240,
}
default_max_num_seqs = {
UsageContext.LLM_CLASS: 256,
......
......@@ -227,6 +227,7 @@ if TYPE_CHECKING:
VLLM_USE_APEX_RN: bool = False
VLLM_USE_GLOBAL_CACHE13: bool = False
VLLM_USE_LIGHTOP: bool = False
VLLM_USE_OPT_ZEROS: bool = False
VLLM_USE_OPT_CAT: bool = False
VLLM_USE_OPT_MOE_SUM: bool = False
VLLM_USE_LIGHTOP_MOE_SUM: bool = False
......@@ -1605,6 +1606,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
lambda: (os.environ.get("VLLM_USE_LIGHTOP", "False").lower() in
("true", "1")),
# vLLM will use elementwise, not triton_
"VLLM_USE_OPT_ZEROS":
lambda: (os.environ.get("VLLM_USE_OPT_ZEROS", "False").lower() in
("true", "1")),
# vLLM will use opt cat for deepseek-v3
"VLLM_USE_OPT_CAT":
lambda: (os.environ.get("VLLM_USE_OPT_CAT", "True").lower() in
......
......@@ -1793,7 +1793,8 @@ class FusedMoE(CustomOp):
assert topk_group is not None
assert num_expert_group is not None
if use_fused_gate:
if envs.VLLM_USE_LIGHTOP:
# if envs.VLLM_USE_LIGHTOP:
if False:
from lightop import op as op
topk_weights, topk_ids = op.moe_fused_gate(
router_logits,
......
......@@ -194,8 +194,8 @@ def _get_model_architecture(
os.environ['LM_NN'] = '1'
if architectures in [['DeepseekV3ForCausalLM'], ['DeepSeekMTPModel']]:
# if not envs.is_set("VLLM_USE_LIGHTOP"):
# os.environ['VLLM_USE_LIGHTOP'] = '1'
if not envs.is_set("VLLM_USE_LIGHTOP"):
os.environ['VLLM_USE_LIGHTOP'] = '1'
if not envs.is_set("VLLM_USE_OPT_CAT"):
os.environ['VLLM_USE_OPT_CAT'] = '1'
......
......@@ -284,7 +284,7 @@ class Qwen3MoeAttention(nn.Module):
if envs.VLLM_USE_APEX_RN:
q_by_head = self.q_norm.forward_apex(q_by_head)
else:
q_by_head = self.q_norm(q_by_head)
q_by_head = self.q_norm.forward_cuda(q_by_head)
q = q_by_head.view(q.shape)
k_by_head = k.view(*k.shape[:-1], k.shape[-1] // self.head_dim,
......@@ -292,7 +292,7 @@ class Qwen3MoeAttention(nn.Module):
if envs.VLLM_USE_APEX_RN:
k_by_head = self.k_norm.forward_apex(k_by_head)
else:
k_by_head = self.k_norm(k_by_head)
k_by_head = self.k_norm.forward_cuda(k_by_head)
k = k_by_head.view(k.shape)
q, k = self.rotary_emb(positions, q, k)
attn_output = self.attn(q, k, v)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment