Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
90c20079
Unverified
Commit
90c20079
authored
Jan 23, 2026
by
Xin Yang
Committed by
GitHub
Jan 23, 2026
Browse files
[Bugfix] Disable tma_aligned_scales in test_fusions_e2e (#32916)
Signed-off-by:
Xin Yang
<
xyangx@amazon.com
>
parent
d95d6507
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
9 additions
and
1 deletion
+9
-1
tests/compile/distributed/test_fusions_e2e.py
tests/compile/distributed/test_fusions_e2e.py
+3
-0
vllm/envs.py
vllm/envs.py
+5
-0
vllm/model_executor/layers/quantization/utils/fp8_utils.py
vllm/model_executor/layers/quantization/utils/fp8_utils.py
+1
-1
No files found.
tests/compile/distributed/test_fusions_e2e.py
View file @
90c20079
...
@@ -290,6 +290,9 @@ def test_rms_group_quant(
...
@@ -290,6 +290,9 @@ def test_rms_group_quant(
# Force spawn as it is more general.
# Force spawn as it is more general.
monkeypatch
.
setenv
(
"VLLM_WORKER_MULTIPROC_METHOD"
,
"spawn"
)
monkeypatch
.
setenv
(
"VLLM_WORKER_MULTIPROC_METHOD"
,
"spawn"
)
# TODO: remove this after fusion is fixed
monkeypatch
.
setenv
(
"VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES"
,
"0"
)
model_kwargs
[
"attention_config"
]
=
{
"backend"
:
backend
.
name
}
model_kwargs
[
"attention_config"
]
=
{
"backend"
:
backend
.
name
}
compilation_config
=
CompilationConfig
(
compilation_config
=
CompilationConfig
(
...
...
vllm/envs.py
View file @
90c20079
...
@@ -162,6 +162,7 @@ if TYPE_CHECKING:
...
@@ -162,6 +162,7 @@ if TYPE_CHECKING:
VLLM_USE_DEEP_GEMM
:
bool
=
True
VLLM_USE_DEEP_GEMM
:
bool
=
True
VLLM_MOE_USE_DEEP_GEMM
:
bool
=
True
VLLM_MOE_USE_DEEP_GEMM
:
bool
=
True
VLLM_USE_DEEP_GEMM_E8M0
:
bool
=
True
VLLM_USE_DEEP_GEMM_E8M0
:
bool
=
True
VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES
:
bool
=
True
VLLM_DEEP_GEMM_WARMUP
:
Literal
[
VLLM_DEEP_GEMM_WARMUP
:
Literal
[
"skip"
,
"skip"
,
"full"
,
"full"
,
...
@@ -1201,6 +1202,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
...
@@ -1201,6 +1202,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USE_DEEP_GEMM_E8M0"
:
lambda
:
bool
(
"VLLM_USE_DEEP_GEMM_E8M0"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_USE_DEEP_GEMM_E8M0"
,
"1"
))
int
(
os
.
getenv
(
"VLLM_USE_DEEP_GEMM_E8M0"
,
"1"
))
),
),
# Whether to create TMA-aligned scale tensor when DeepGEMM is used.
"VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES"
,
"1"
))
),
# DeepGemm JITs the kernels on-demand. The warmup attempts to make DeepGemm
# DeepGemm JITs the kernels on-demand. The warmup attempts to make DeepGemm
# JIT all the required kernels before model execution so there is no
# JIT all the required kernels before model execution so there is no
# JIT'ing in the hot-path. However, this warmup increases the engine
# JIT'ing in the hot-path. However, this warmup increases the engine
...
...
vllm/model_executor/layers/quantization/utils/fp8_utils.py
View file @
90c20079
...
@@ -379,7 +379,7 @@ class W8A8BlockFp8LinearOp:
...
@@ -379,7 +379,7 @@ class W8A8BlockFp8LinearOp:
False
,
False
,
self
.
act_quant_group_shape
,
self
.
act_quant_group_shape
,
column_major_scales
=
True
,
column_major_scales
=
True
,
tma_aligned_scales
=
True
,
tma_aligned_scales
=
envs
.
VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES
,
use_ue8m0
=
self
.
use_deep_gemm_e8m0
,
use_ue8m0
=
self
.
use_deep_gemm_e8m0
,
)
)
if
self
.
is_deep_gemm_supported
if
self
.
is_deep_gemm_supported
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment