Unverified commit 510ed1e8, authored by XiongfeiWei, committed by GitHub
Browse files

[Bugfix][TPU] Return a Default fp8 MoE Backend (#32908)


Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
parent 8caffd92
...@@ -35,6 +35,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( ...@@ -35,6 +35,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.model_executor.layers.quantization.utils.quant_utils import (
QuantKey, QuantKey,
) )
from vllm.platforms import current_platform
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -330,10 +331,17 @@ def select_fp8_moe_backend( ...@@ -330,10 +331,17 @@ def select_fp8_moe_backend(
else: else:
logger.debug_once(_make_log_unsupported(backend, reason), scope="local") logger.debug_once(_make_log_unsupported(backend, reason), scope="local")
# TODO(rob): per discussion with the TPU team, we need a way to register
# MoE backends by OOT plugins, rather than having an explicit list
# of AVAILABLE_BACKENDS. Returning `Fp8MoeBackend.NONE` is a
# temporary measure until these registration APIs are complete.
if current_platform.is_cuda() or current_platform.is_rocm():
raise NotImplementedError( raise NotImplementedError(
"No FP8 MoE backend supports the deployment configuration." "No FP8 MoE backend supports the deployment configuration."
) )
return Fp8MoeBackend.NONE, None
def convert_to_fp8_moe_kernel_format( def convert_to_fp8_moe_kernel_format(
fp8_backend: Fp8MoeBackend, fp8_backend: Fp8MoeBackend,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment