"vscode:/vscode.git/clone" did not exist on "81ecf425f0a645e6d6f1c339ba79ef37e95a9569"
Commit 6216b12d authored by zhuwenwen
Browse files

remove SUPPORT_MOE_MARLIN_W16A16

parent 1a26d0b6
...@@ -16,12 +16,6 @@ from vllm.utils import cuda_device_count_stateless ...@@ -16,12 +16,6 @@ from vllm.utils import cuda_device_count_stateless
from .interface import DeviceCapability, Platform, PlatformEnum, _Backend from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
from vllm.utils import SUPPORT_MOE_MARLIN_W16A16
if SUPPORT_MOE_MARLIN_W16A16:
os.environ['VLLM_USE_MARLIN_W16A16_MOE'] = '1'
os.environ['MOE_NN'] = '0'
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.config import ModelConfig, VllmConfig from vllm.config import ModelConfig, VllmConfig
......
...@@ -86,10 +86,6 @@ if TYPE_CHECKING: ...@@ -86,10 +86,6 @@ if TYPE_CHECKING:
logger = init_logger(__name__) logger = init_logger(__name__)
GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
SUPPORT_MOE_MARLIN_W16A16 = any(arch in GPU_ARCH for arch in ["gfx936"])
# This value is chosen to have a balance between ITL and TTFT. Note it is # This value is chosen to have a balance between ITL and TTFT. Note it is
# not optimized for throughput. # not optimized for throughput.
DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048 DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment