Commit 4d97c5fc authored by zhuwenwen
Browse files

update VLLM_USE_OPT_CAT

parent 8d2cac26
...@@ -1094,15 +1094,15 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -1094,15 +1094,15 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USE_GLOBAL_CACHE13": "VLLM_USE_GLOBAL_CACHE13":
lambda: (os.environ.get("VLLM_USE_GLOBAL_CACHE13", "False").lower() in lambda: (os.environ.get("VLLM_USE_GLOBAL_CACHE13", "False").lower() in
("true", "1")), ("true", "1")),
# vLLM will use global cache for moe # vLLM will use lightop for deepseek-v3
"VLLM_USE_LIGHTOP": "VLLM_USE_LIGHTOP":
lambda: (os.environ.get("VLLM_USE_LIGHTOP", "False").lower() in lambda: (os.environ.get("VLLM_USE_LIGHTOP", "False").lower() in
("true", "1")), ("true", "1")),
# vLLM will use global cache for moe # vLLM will use opt cat for deepseek-v3
"VLLM_USE_OPT_CAT": "VLLM_USE_OPT_CAT":
lambda: (os.environ.get("VLLM_USE_OPT_CAT", "True").lower() in lambda: (os.environ.get("VLLM_USE_OPT_CAT", "False").lower() in
("true", "1")), ("true", "1")),
# vLLM will use opt merge_attn_states,not triton # vLLM will use opt merge_attn_states, not triton
"VLLM_USE_MERGE_ATTN_STATES_OPT": "VLLM_USE_MERGE_ATTN_STATES_OPT":
lambda: (os.environ.get("VLLM_USE_MERGE_ATTN_STATES_OPT", "True").lower() in lambda: (os.environ.get("VLLM_USE_MERGE_ATTN_STATES_OPT", "True").lower() in
("true", "1")), ("true", "1")),
......
...@@ -1442,8 +1442,8 @@ class FusedMoE(torch.nn.Module): ...@@ -1442,8 +1442,8 @@ class FusedMoE(torch.nn.Module):
if current_platform.is_tpu(): if current_platform.is_tpu():
return self.forward_impl(hidden_states, router_logits) return self.forward_impl(hidden_states, router_logits)
else: else:
return torch.ops.vllm.moe_forward(hidden_states, router_logits, shared_output, return torch.ops.vllm.moe_forward(hidden_states, router_logits,
self.layer_name) self.layer_name, shared_output)
def forward_impl_chunked(self, full_hidden_states: torch.Tensor, def forward_impl_chunked(self, full_hidden_states: torch.Tensor,
full_router_logits: torch.Tensor): full_router_logits: torch.Tensor):
......
...@@ -247,6 +247,8 @@ def get_model_architecture( ...@@ -247,6 +247,8 @@ def get_model_architecture(
if (architectures == ['DeepseekV3ForCausalLM'] or architectures == ['DeepSeekMTPModel']): if (architectures == ['DeepseekV3ForCausalLM'] or architectures == ['DeepSeekMTPModel']):
if not envs.is_set("VLLM_USE_LIGHTOP"): if not envs.is_set("VLLM_USE_LIGHTOP"):
os.environ['VLLM_USE_LIGHTOP'] = '1' os.environ['VLLM_USE_LIGHTOP'] = '1'
if not envs.is_set("VLLM_USE_OPT_CAT"):
os.environ['VLLM_USE_OPT_CAT'] = '1'
if os.getenv('GEMM_PAD') != '1': if os.getenv('GEMM_PAD') != '1':
os.environ['GEMM_PAD'] = '0' os.environ['GEMM_PAD'] = '0'
......
...@@ -7,7 +7,7 @@ import torch ...@@ -7,7 +7,7 @@ import torch
import math import math
import vllm.envs as envs import vllm.envs as envs
if envs.VLLM_USE_LIGHTOP: if envs.VLLM_USE_OPT_CAT:
from lightop import ds_cat from lightop import ds_cat
def test_concat_Acc_prefill(shape_pair, dim): def test_concat_Acc_prefill(shape_pair, dim):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment