Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e9660f3a
Commit
e9660f3a
authored
Dec 18, 2025
by
zhuwenwen
Browse files
skip fp32_precision and static_scaled_fp8_quant, set VLLM_USE_BYTECODE_HOOK=0
parent
c98b6a8f
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
14 additions
and
14 deletions
+14
-14
vllm/compilation/matcher_utils.py
vllm/compilation/matcher_utils.py
+3
-3
vllm/envs.py
vllm/envs.py
+1
-1
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+8
-8
vllm/v1/worker/gpu_worker.py
vllm/v1/worker/gpu_worker.py
+2
-2
No files found.
vllm/compilation/matcher_utils.py
View file @
e9660f3a
...
@@ -29,9 +29,9 @@ ROTARY_OP = torch.ops._C.rotary_embedding.default
...
@@ -29,9 +29,9 @@ ROTARY_OP = torch.ops._C.rotary_embedding.default
FLASHINFER_ROTARY_OP
=
torch
.
ops
.
vllm
.
flashinfer_rotary_embedding
.
default
FLASHINFER_ROTARY_OP
=
torch
.
ops
.
vllm
.
flashinfer_rotary_embedding
.
default
QUANT_OPS
:
dict
[
QuantKey
,
OpOverload
]
=
{
QUANT_OPS
:
dict
[
QuantKey
,
OpOverload
]
=
{
kFp8StaticTensorSym
:
torch
.
ops
.
_C
.
static_scaled_fp8_quant
.
default
,
# noqa: E501
#
kFp8StaticTensorSym: torch.ops._C.static_scaled_fp8_quant.default, # noqa: E501
kFp8DynamicTensorSym
:
torch
.
ops
.
_C
.
dynamic_scaled_fp8_quant
.
default
,
# noqa: E501
#
kFp8DynamicTensorSym: torch.ops._C.dynamic_scaled_fp8_quant.default, # noqa: E501
kFp8DynamicTokenSym
:
torch
.
ops
.
_C
.
dynamic_per_token_scaled_fp8_quant
.
default
,
# noqa: E501
#
kFp8DynamicTokenSym: torch.ops._C.dynamic_per_token_scaled_fp8_quant.default, # noqa: E501
}
}
if
current_platform
.
is_cuda
()
and
hasattr
(
torch
.
ops
.
_C
,
"scaled_fp4_quant"
):
if
current_platform
.
is_cuda
()
and
hasattr
(
torch
.
ops
.
_C
,
"scaled_fp4_quant"
):
...
...
vllm/envs.py
View file @
e9660f3a
...
@@ -589,7 +589,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
...
@@ -589,7 +589,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
# Feature flag to enable/disable bytecode in
# Feature flag to enable/disable bytecode in
# TorchCompileWithNoGuardsWrapper.
# TorchCompileWithNoGuardsWrapper.
"VLLM_USE_BYTECODE_HOOK"
:
lambda
:
bool
(
"VLLM_USE_BYTECODE_HOOK"
:
lambda
:
bool
(
int
(
os
.
environ
.
get
(
"VLLM_USE_BYTECODE_HOOK"
,
"
1
"
))
int
(
os
.
environ
.
get
(
"VLLM_USE_BYTECODE_HOOK"
,
"
0
"
))
),
),
# Force vllm to always load AOT compiled models from disk. Failure
# Force vllm to always load AOT compiled models from disk. Failure
# to load will result in a hard error when this is enabled.
# to load will result in a hard error when this is enabled.
...
...
vllm/platforms/rocm.py
View file @
e9660f3a
...
@@ -195,7 +195,7 @@ class RocmPlatform(Platform):
...
@@ -195,7 +195,7 @@ class RocmPlatform(Platform):
selected_backend
:
"AttentionBackendEnum"
,
selected_backend
:
"AttentionBackendEnum"
,
attn_selector_config
:
"AttentionSelectorConfig"
,
attn_selector_config
:
"AttentionSelectorConfig"
,
)
->
str
:
)
->
str
:
from
vllm._aiter_ops
import
rocm_aiter_ops
#
from vllm._aiter_ops import rocm_aiter_ops
block_size
=
attn_selector_config
.
block_size
block_size
=
attn_selector_config
.
block_size
kv_cache_dtype
=
attn_selector_config
.
kv_cache_dtype
kv_cache_dtype
=
attn_selector_config
.
kv_cache_dtype
...
@@ -285,13 +285,13 @@ class RocmPlatform(Platform):
...
@@ -285,13 +285,13 @@ class RocmPlatform(Platform):
# Priority 4: Check for AITER enabled without specific flags
# Priority 4: Check for AITER enabled without specific flags
# This defaults to AITER FA only if MHA is not explicitly disabled
# This defaults to AITER FA only if MHA is not explicitly disabled
if
(
#
if (
envs
.
VLLM_ROCM_USE_AITER
#
envs.VLLM_ROCM_USE_AITER
and
on_gfx9
()
#
and on_gfx9()
and
envs
.
VLLM_ROCM_USE_AITER_MHA
is
not
False
#
and envs.VLLM_ROCM_USE_AITER_MHA is not False
):
#
):
logger
.
info
(
"Using Aiter Flash Attention backend on V1 engine."
)
#
logger.info("Using Aiter Flash Attention backend on V1 engine.")
return
AttentionBackendEnum
.
ROCM_AITER_FA
.
get_path
()
#
return AttentionBackendEnum.ROCM_AITER_FA.get_path()
# Default: Triton Unified Attention
# Default: Triton Unified Attention
logger
.
info
(
"Using Triton Attention backend on V1 engine."
)
logger
.
info
(
"Using Triton Attention backend on V1 engine."
)
...
...
vllm/v1/worker/gpu_worker.py
View file @
e9660f3a
...
@@ -81,8 +81,8 @@ class Worker(WorkerBase):
...
@@ -81,8 +81,8 @@ class Worker(WorkerBase):
)
)
# configure float32 matmul precision according to vLLM env.
# configure float32 matmul precision according to vLLM env.
precision
=
envs
.
VLLM_FLOAT32_MATMUL_PRECISION
#
precision = envs.VLLM_FLOAT32_MATMUL_PRECISION
torch
.
backends
.
cuda
.
matmul
.
fp32_precision
=
precision
#
torch.backends.cuda.matmul.fp32_precision = precision
if
self
.
model_config
.
trust_remote_code
:
if
self
.
model_config
.
trust_remote_code
:
# note: lazy import to avoid importing torch before initializing
# note: lazy import to avoid importing torch before initializing
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment