Commit e9660f3a authored by zhuwenwen
Browse files

skip fp32_precision and static_scaled_fp8_quant, set VLLM_USE_BYTECODE_HOOK=0

parent c98b6a8f
...@@ -29,9 +29,9 @@ ROTARY_OP = torch.ops._C.rotary_embedding.default ...@@ -29,9 +29,9 @@ ROTARY_OP = torch.ops._C.rotary_embedding.default
FLASHINFER_ROTARY_OP = torch.ops.vllm.flashinfer_rotary_embedding.default FLASHINFER_ROTARY_OP = torch.ops.vllm.flashinfer_rotary_embedding.default
QUANT_OPS: dict[QuantKey, OpOverload] = { QUANT_OPS: dict[QuantKey, OpOverload] = {
kFp8StaticTensorSym: torch.ops._C.static_scaled_fp8_quant.default, # noqa: E501 # kFp8StaticTensorSym: torch.ops._C.static_scaled_fp8_quant.default, # noqa: E501
kFp8DynamicTensorSym: torch.ops._C.dynamic_scaled_fp8_quant.default, # noqa: E501 # kFp8DynamicTensorSym: torch.ops._C.dynamic_scaled_fp8_quant.default, # noqa: E501
kFp8DynamicTokenSym: torch.ops._C.dynamic_per_token_scaled_fp8_quant.default, # noqa: E501 # kFp8DynamicTokenSym: torch.ops._C.dynamic_per_token_scaled_fp8_quant.default, # noqa: E501
} }
if current_platform.is_cuda() and hasattr(torch.ops._C, "scaled_fp4_quant"): if current_platform.is_cuda() and hasattr(torch.ops._C, "scaled_fp4_quant"):
......
...@@ -589,7 +589,7 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -589,7 +589,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
# Feature flag to enable/disable bytecode in # Feature flag to enable/disable bytecode in
# TorchCompileWithNoGuardsWrapper. # TorchCompileWithNoGuardsWrapper.
"VLLM_USE_BYTECODE_HOOK": lambda: bool( "VLLM_USE_BYTECODE_HOOK": lambda: bool(
int(os.environ.get("VLLM_USE_BYTECODE_HOOK", "1")) int(os.environ.get("VLLM_USE_BYTECODE_HOOK", "0"))
), ),
# Force vllm to always load AOT compiled models from disk. Failure # Force vllm to always load AOT compiled models from disk. Failure
# to load will result in a hard error when this is enabled. # to load will result in a hard error when this is enabled.
......
...@@ -195,7 +195,7 @@ class RocmPlatform(Platform): ...@@ -195,7 +195,7 @@ class RocmPlatform(Platform):
selected_backend: "AttentionBackendEnum", selected_backend: "AttentionBackendEnum",
attn_selector_config: "AttentionSelectorConfig", attn_selector_config: "AttentionSelectorConfig",
) -> str: ) -> str:
from vllm._aiter_ops import rocm_aiter_ops # from vllm._aiter_ops import rocm_aiter_ops
block_size = attn_selector_config.block_size block_size = attn_selector_config.block_size
kv_cache_dtype = attn_selector_config.kv_cache_dtype kv_cache_dtype = attn_selector_config.kv_cache_dtype
...@@ -285,13 +285,13 @@ class RocmPlatform(Platform): ...@@ -285,13 +285,13 @@ class RocmPlatform(Platform):
# Priority 4: Check for AITER enabled without specific flags # Priority 4: Check for AITER enabled without specific flags
# This defaults to AITER FA only if MHA is not explicitly disabled # This defaults to AITER FA only if MHA is not explicitly disabled
if ( # if (
envs.VLLM_ROCM_USE_AITER # envs.VLLM_ROCM_USE_AITER
and on_gfx9() # and on_gfx9()
and envs.VLLM_ROCM_USE_AITER_MHA is not False # and envs.VLLM_ROCM_USE_AITER_MHA is not False
): # ):
logger.info("Using Aiter Flash Attention backend on V1 engine.") # logger.info("Using Aiter Flash Attention backend on V1 engine.")
return AttentionBackendEnum.ROCM_AITER_FA.get_path() # return AttentionBackendEnum.ROCM_AITER_FA.get_path()
# Default: Triton Unified Attention # Default: Triton Unified Attention
logger.info("Using Triton Attention backend on V1 engine.") logger.info("Using Triton Attention backend on V1 engine.")
......
...@@ -81,8 +81,8 @@ class Worker(WorkerBase): ...@@ -81,8 +81,8 @@ class Worker(WorkerBase):
) )
# configure float32 matmul precision according to vLLM env. # configure float32 matmul precision according to vLLM env.
precision = envs.VLLM_FLOAT32_MATMUL_PRECISION # precision = envs.VLLM_FLOAT32_MATMUL_PRECISION
torch.backends.cuda.matmul.fp32_precision = precision # torch.backends.cuda.matmul.fp32_precision = precision
if self.model_config.trust_remote_code: if self.model_config.trust_remote_code:
# note: lazy import to avoid importing torch before initializing # note: lazy import to avoid importing torch before initializing
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment