Commit e9660f3a authored by zhuwenwen
Browse files

skip fp32_precision and static_scaled_fp8_quant, set VLLM_USE_BYTECODE_HOOK=0

parent c98b6a8f
......@@ -29,9 +29,9 @@ ROTARY_OP = torch.ops._C.rotary_embedding.default
FLASHINFER_ROTARY_OP = torch.ops.vllm.flashinfer_rotary_embedding.default
QUANT_OPS: dict[QuantKey, OpOverload] = {
kFp8StaticTensorSym: torch.ops._C.static_scaled_fp8_quant.default, # noqa: E501
kFp8DynamicTensorSym: torch.ops._C.dynamic_scaled_fp8_quant.default, # noqa: E501
kFp8DynamicTokenSym: torch.ops._C.dynamic_per_token_scaled_fp8_quant.default, # noqa: E501
# kFp8StaticTensorSym: torch.ops._C.static_scaled_fp8_quant.default, # noqa: E501
# kFp8DynamicTensorSym: torch.ops._C.dynamic_scaled_fp8_quant.default, # noqa: E501
# kFp8DynamicTokenSym: torch.ops._C.dynamic_per_token_scaled_fp8_quant.default, # noqa: E501
}
if current_platform.is_cuda() and hasattr(torch.ops._C, "scaled_fp4_quant"):
......
......@@ -589,7 +589,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
# Feature flag to enable/disable bytecode in
# TorchCompileWithNoGuardsWrapper.
"VLLM_USE_BYTECODE_HOOK": lambda: bool(
int(os.environ.get("VLLM_USE_BYTECODE_HOOK", "1"))
int(os.environ.get("VLLM_USE_BYTECODE_HOOK", "0"))
),
# Force vllm to always load AOT compiled models from disk. Failure
# to load will result in a hard error when this is enabled.
......
......@@ -195,7 +195,7 @@ class RocmPlatform(Platform):
selected_backend: "AttentionBackendEnum",
attn_selector_config: "AttentionSelectorConfig",
) -> str:
from vllm._aiter_ops import rocm_aiter_ops
# from vllm._aiter_ops import rocm_aiter_ops
block_size = attn_selector_config.block_size
kv_cache_dtype = attn_selector_config.kv_cache_dtype
......@@ -285,13 +285,13 @@ class RocmPlatform(Platform):
# Priority 4: Check for AITER enabled without specific flags
# This defaults to AITER FA only if MHA is not explicitly disabled
if (
envs.VLLM_ROCM_USE_AITER
and on_gfx9()
and envs.VLLM_ROCM_USE_AITER_MHA is not False
):
logger.info("Using Aiter Flash Attention backend on V1 engine.")
return AttentionBackendEnum.ROCM_AITER_FA.get_path()
# if (
# envs.VLLM_ROCM_USE_AITER
# and on_gfx9()
# and envs.VLLM_ROCM_USE_AITER_MHA is not False
# ):
# logger.info("Using Aiter Flash Attention backend on V1 engine.")
# return AttentionBackendEnum.ROCM_AITER_FA.get_path()
# Default: Triton Unified Attention
logger.info("Using Triton Attention backend on V1 engine.")
......
......@@ -81,8 +81,8 @@ class Worker(WorkerBase):
)
# configure float32 matmul precision according to vLLM env.
precision = envs.VLLM_FLOAT32_MATMUL_PRECISION
torch.backends.cuda.matmul.fp32_precision = precision
# precision = envs.VLLM_FLOAT32_MATMUL_PRECISION
# torch.backends.cuda.matmul.fp32_precision = precision
if self.model_config.trust_remote_code:
# note: lazy import to avoid importing torch before initializing
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment