Commit ad60a973 authored by zhuwenwen
Browse files

add SUPPORT_MOE_MARLIN_W16A16 to use moe marlin on bw

parent f4cef40c
...@@ -16,6 +16,13 @@ from vllm.utils import cuda_device_count_stateless ...@@ -16,6 +16,13 @@ from vllm.utils import cuda_device_count_stateless
from .interface import DeviceCapability, Platform, PlatformEnum, _Backend from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
from vllm.utils import SUPPORT_MOE_MARLIN_W16A16

# When the detected GPU supports the W16A16 MoE Marlin path, export the
# environment switches at import time so later readers of these variables
# see them.
# NOTE(review): the diff view dropped indentation; both variables are assumed
# to be set only inside this conditional -- confirm against the real file.
if SUPPORT_MOE_MARLIN_W16A16:
    os.environ.update({
        'VLLM_USE_MARLIN_W16A16_MOE': '1',
        'MOE_NN': '0',
    })
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.config import ModelConfig, VllmConfig from vllm.config import ModelConfig, VllmConfig
......
...@@ -86,6 +86,10 @@ if TYPE_CHECKING: ...@@ -86,6 +86,10 @@ if TYPE_CHECKING:
logger = init_logger(__name__) logger = init_logger(__name__)
# Architecture name reported by the current GPU, or "" when it cannot be
# determined. This runs at import time, so it must not raise: guard against
# hosts with no usable GPU and against device-property objects that do not
# carry a `gcnArchName` attribute.
if torch.cuda.is_available():
    GPU_ARCH = getattr(torch.cuda.get_device_properties("cuda"),
                       "gcnArchName", "")
else:
    GPU_ARCH = ""

# True only when the detected architecture string contains one of the listed
# architecture names; an empty GPU_ARCH therefore yields False.
SUPPORT_MOE_MARLIN_W16A16 = any(arch in GPU_ARCH for arch in ("gfx936", ))
# This value is chosen to have a balance between ITL and TTFT. Note it is # This value is chosen to have a balance between ITL and TTFT. Note it is
# not optimized for throughput. # not optimized for throughput.
DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048 DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment