Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
bde57ab2
Unverified
Commit
bde57ab2
authored
Jan 11, 2026
by
Matt
Committed by
GitHub
Jan 10, 2026
Browse files
[Hardware][AMD][CI][Bugfix] Fix AMD Quantization test group (#31713)
Signed-off-by:
Matthew Wong
<
Matthew.Wong2@amd.com
>
parent
9103ed16
Changes
12
Show whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
114 additions
and
52 deletions
+114
-52
.buildkite/test-amd.yaml
.buildkite/test-amd.yaml
+1
-1
tests/quantization/test_compressed_tensors.py
tests/quantization/test_compressed_tensors.py
+7
-1
tests/quantization/test_configs.py
tests/quantization/test_configs.py
+31
-6
tests/quantization/test_cpu_offload.py
tests/quantization/test_cpu_offload.py
+1
-1
tests/quantization/test_fp8.py
tests/quantization/test_fp8.py
+12
-4
tests/quantization/test_gptq_dynamic.py
tests/quantization/test_gptq_dynamic.py
+5
-1
tests/quantization/test_ptpc_fp8.py
tests/quantization/test_ptpc_fp8.py
+12
-25
tests/quantization/utils.py
tests/quantization/utils.py
+5
-0
vllm/model_executor/layers/quantization/__init__.py
vllm/model_executor/layers/quantization/__init__.py
+4
-0
vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
..._executor/layers/quantization/kernels/scaled_mm/triton.py
+17
-0
vllm/model_executor/layers/quantization/ptpc_fp8.py
vllm/model_executor/layers/quantization/ptpc_fp8.py
+17
-13
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+2
-0
No files found.
.buildkite/test-amd.yaml
View file @
bde57ab2
...
...
@@ -731,7 +731,7 @@ steps:
-
label
:
Quantization Test
# 70min
timeout_in_minutes
:
90
mirror_hardwares
:
[
amdexperimental
]
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
agent_pool
:
mi325_1
# grade: Blocking
source_file_dependencies
:
...
...
tests/quantization/test_compressed_tensors.py
View file @
bde57ab2
...
...
@@ -644,6 +644,9 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
assert
output
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda
(),
reason
=
"This test is skipped on non-CUDA platform."
)
@
pytest
.
mark
.
parametrize
(
"args"
,
[
...
...
@@ -762,7 +765,10 @@ def test_compressed_tensors_fp8_block_enabled(vllm_runner):
input_quant_op
=
qkv_proj
.
scheme
.
w8a8_block_fp8_linear
.
input_quant_op
assert
isinstance
(
input_quant_op
,
QuantFP8
)
assert
input_quant_op
.
_forward_method
==
input_quant_op
.
forward_cuda
assert
input_quant_op
.
_forward_method
in
(
input_quant_op
.
forward_cuda
,
input_quant_op
.
forward_hip
,
)
llm
.
apply_model
(
check_model
)
...
...
tests/quantization/test_configs.py
View file @
bde57ab2
...
...
@@ -10,6 +10,7 @@ from dataclasses import dataclass
import
pytest
from
vllm.config
import
ModelConfig
from
vllm.platforms
import
current_platform
@
dataclass
...
...
@@ -23,20 +24,44 @@ MODEL_ARG_EXPTYPES = [
# AUTOGPTQ
# compat: autogptq <=0.7.1 is_marlin_format: bool
# Model Serialized in Exllama Format.
(
"TheBloke/Llama-2-7B-Chat-GPTQ"
,
None
,
"gptq_marlin"
),
(
"TheBloke/Llama-2-7B-Chat-GPTQ"
,
"marlin"
,
"gptq_marlin"
),
(
"TheBloke/Llama-2-7B-Chat-GPTQ"
,
None
,
"gptq_marlin"
if
current_platform
.
is_cuda
()
else
"gptq"
,
),
(
"TheBloke/Llama-2-7B-Chat-GPTQ"
,
"marlin"
,
"gptq_marlin"
if
current_platform
.
is_cuda
()
else
"ERROR"
,
),
(
"TheBloke/Llama-2-7B-Chat-GPTQ"
,
"gptq"
,
"gptq"
),
(
"TheBloke/Llama-2-7B-Chat-GPTQ"
,
"awq"
,
"ERROR"
),
# compat: autogptq >=0.8.0 use checkpoint_format: str
# Model Serialized in Exllama Format.
(
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
,
None
,
"gptq_marlin"
),
(
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
,
"marlin"
,
"gptq_marlin"
),
(
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
,
None
,
"gptq_marlin"
if
current_platform
.
is_cuda
()
else
"gptq"
,
),
(
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
,
"marlin"
,
"gptq_marlin"
if
current_platform
.
is_cuda
()
else
"ERROR"
,
),
(
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
,
"gptq"
,
"gptq"
),
(
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
,
"awq"
,
"ERROR"
),
# AUTOAWQ
(
"TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"
,
None
,
"awq_marlin"
),
(
"TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"
,
None
,
"awq_marlin"
if
current_platform
.
is_cuda
()
else
"awq"
,
),
(
"TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"
,
"awq"
,
"awq"
),
(
"TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"
,
"marlin"
,
"awq_marlin"
),
(
"TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"
,
"marlin"
,
"awq_marlin"
if
current_platform
.
is_cuda
()
else
"ERROR"
,
),
(
"TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"
,
"gptq"
,
"ERROR"
),
]
...
...
tests/quantization/test_cpu_offload.py
View file @
bde57ab2
...
...
@@ -66,7 +66,7 @@ def test_cpu_offload_compressed_tensors(monkeypatch):
monkeypatch
.
setenv
(
"VLLM_TEST_FORCE_LOAD_FORMAT"
,
"auto"
)
# Test wNa16
compare_two_settings
(
"nm-testing/
tinyllama-oneshot-w4a16-channel-v2
"
,
"nm-testing/
Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16
"
,
[
"--enforce_eager"
],
[
"--enforce_eager"
,
"--cpu-offload-gb"
,
"1"
],
max_wait_seconds
=
480
,
...
...
tests/quantization/test_fp8.py
View file @
bde57ab2
...
...
@@ -36,7 +36,9 @@ MODELS = [
reason
=
"FP8 is not supported on this GPU type."
,
)
@
pytest
.
mark
.
parametrize
(
"model_id"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"force_marlin"
,
[
False
,
True
])
@
pytest
.
mark
.
parametrize
(
"force_marlin"
,
[
False
]
if
current_platform
.
is_rocm
()
else
[
False
,
True
]
)
@
pytest
.
mark
.
parametrize
(
"use_rocm_aiter"
,
[
True
,
False
]
if
current_platform
.
is_rocm
()
else
[
False
]
)
...
...
@@ -125,7 +127,9 @@ def test_kv_cache_model_load_and_run(
reason
=
"FP8 is not supported on this GPU type."
,
)
@
pytest
.
mark
.
parametrize
(
"kv_cache_dtype"
,
[
"auto"
,
"fp8"
])
@
pytest
.
mark
.
parametrize
(
"force_marlin"
,
[
False
,
True
])
@
pytest
.
mark
.
parametrize
(
"force_marlin"
,
[
False
]
if
current_platform
.
is_rocm
()
else
[
False
,
True
]
)
@
pytest
.
mark
.
parametrize
(
"use_rocm_aiter"
,
[
True
,
False
]
if
current_platform
.
is_rocm
()
else
[
False
]
)
...
...
@@ -197,10 +201,10 @@ def test_scaled_fp8_quant(dtype) -> None:
def
quantize_ref
(
tensor
,
inv_scale
):
# The reference implementation that fully aligns to
# the kernel being tested.
finfo
=
torch
.
finfo
(
torch
.
float8_e4m3fn
)
finfo
=
torch
.
finfo
(
current_platform
.
fp8_dtype
()
)
scale
=
inv_scale
.
reciprocal
()
qweight
=
(
tensor
.
to
(
torch
.
float32
)
*
scale
).
clamp
(
min
=
finfo
.
min
,
max
=
finfo
.
max
)
qweight
=
qweight
.
to
(
torch
.
float8_e4m3fn
)
qweight
=
qweight
.
to
(
current_platform
.
fp8_dtype
()
)
return
qweight
def
per_tensor_dequantize
(
tensor
,
inv_scale
,
dtype
):
...
...
@@ -267,6 +271,10 @@ def test_scaled_fp8_quant(dtype) -> None:
)
@
pytest
.
mark
.
skipif
(
current_platform
.
is_fp8_fnuz
(),
reason
=
"FP8 e4m3fn weight reloading is not supported on e4m3fnuz platforms"
,
)
@
pytest
.
mark
.
parametrize
(
"method_cls"
,
[
Fp8LinearMethod
,
Fp8MoEMethod
])
# FP8 weight reloading does not support online quantization
@
pytest
.
mark
.
parametrize
(
"is_checkpoint_fp8_serialized"
,
[
True
])
# skip False
...
...
tests/quantization/test_gptq_dynamic.py
View file @
bde57ab2
...
...
@@ -14,6 +14,7 @@ from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinLinear
from
vllm.model_executor.layers.quantization.utils.gptq_utils
import
(
get_dynamic_override
,
)
from
vllm.platforms
import
current_platform
PROMPT
=
"On the surface of Mars, we found"
...
...
@@ -21,7 +22,10 @@ PROMPT = "On the surface of Mars, we found"
# The second layer is quantized using bits=8, group_size=32
# All other layers (layer index >= 2) are not quantized
MODEL_QUANT
=
[
(
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue"
,
True
),
(
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue"
,
current_platform
.
is_cuda
(),
),
(
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse"
,
False
,
...
...
tests/quantization/test_ptpc_fp8.py
View file @
bde57ab2
...
...
@@ -6,18 +6,12 @@ Run `pytest tests/quantization/test_ptpc_fp8.py --forked`.
"""
import
pytest
import
torch
from
tests.quantization.utils
import
is_quant_method_supported
from
vllm.model_executor.layers.quantization.fp8
import
Fp8KVCacheMethod
from
vllm.model_executor.layers.quantization.ptpc_fp8
import
PTPCFp8LinearMethod
from
vllm.platforms
import
current_platform
UNSUPPORTED_STR
=
(
"Currently torch._scaled_mm (hipBLASLt) rowwise gemm only "
"support output dtype of bfloat16. torch.float16 is specified."
)
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
def
enable_pickle
(
monkeypatch
):
...
...
@@ -30,24 +24,17 @@ def enable_pickle(monkeypatch):
reason
=
"PTPC FP8 is not supported on this GPU type."
,
)
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_rocm
(),
reason
=
"This test is for ROCm GPU."
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"
auto"
,
"bfloat16"
,
"
float16"
])
@
pytest
.
mark
.
parametrize
(
"kv_cache_dtype"
,
[
"auto"
,
"fp8"
,
"fp8_e4m3"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"
b
float16"
])
@
pytest
.
mark
.
parametrize
(
"kv_cache_dtype"
,
[
"auto"
,
"fp8"
])
def
test_ptpc_fp8_rocm
(
vllm_runner
,
dtype
:
str
,
kv_cache_dtype
:
str
)
->
None
:
try
:
llm
=
vllm_runner
(
"facebook/opt-125m"
,
dtype
=
dtype
,
quantization
=
"ptpc_fp8"
,
enforce_eager
=
True
,
kv_cache_dtype
=
kv_cache_dtype
,
allow_deprecated_quantization
=
True
,
)
except
AssertionError
as
e
:
if
str
(
e
)
==
UNSUPPORTED_STR
:
# If the error message matches, the test passes
return
else
:
# If the error message does not match, re-raise the exception
raise
with
llm
:
...
...
@@ -60,9 +47,9 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:
assert
attn
.
_k_scale
==
1.0
assert
attn
.
_v_scale
==
1.0
if
current_platform
.
has_device_capability
(
94
):
# For GPUs with hardware support, we keep weights in fp8
assert
fc1
.
weight
.
dtype
==
torch
.
float8_e4m3fnuz
if
current_platform
.
has_device_capability
(
94
):
assert
fc1
.
weight
.
dtype
==
current_platform
.
fp8_dtype
()
llm
.
apply_model
(
check_model
)
...
...
tests/quantization/utils.py
View file @
bde57ab2
...
...
@@ -10,6 +10,11 @@ def is_quant_method_supported(quant_method: str) -> bool:
if
not
(
current_platform
.
is_cuda
()
or
current_platform
.
is_rocm
()):
return
False
try
:
current_platform
.
verify_quantization
(
quant_method
)
except
ValueError
:
return
False
capability
=
current_platform
.
get_device_capability
()
assert
capability
is
not
None
...
...
vllm/model_executor/layers/quantization/__init__.py
View file @
bde57ab2
...
...
@@ -5,6 +5,7 @@ from typing import Literal, get_args
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.quantization.base_config
import
QuantizationConfig
from
vllm.platforms
import
current_platform
logger
=
init_logger
(
__name__
)
...
...
@@ -98,6 +99,9 @@ def register_quantization_config(quantization: str):
)
else
:
QUANTIZATION_METHODS
.
append
(
quantization
)
# Automatically assume the custom quantization config is supported
if
sq
:
=
current_platform
.
supported_quantization
:
sq
.
append
(
quantization
)
if
not
issubclass
(
quant_config_cls
,
QuantizationConfig
):
raise
ValueError
(
...
...
vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
View file @
bde57ab2
...
...
@@ -9,6 +9,9 @@ from vllm.model_executor.layers.quantization.compressed_tensors.triton_scaled_mm
triton_scaled_mm
,
)
from
vllm.model_executor.layers.quantization.utils
import
replace_parameter
from
vllm.model_executor.layers.quantization.utils.w8a8_utils
import
(
convert_to_channelwise
,
)
from
vllm.platforms
import
current_platform
from
.ScaledMMLinearKernel
import
ScaledMMLinearKernel
,
ScaledMMLinearLayerConfig
...
...
@@ -37,6 +40,20 @@ class TritonScaledMMLinearKernel(ScaledMMLinearKernel):
torch
.
nn
.
Parameter
(
weight
.
t
().
data
,
requires_grad
=
False
),
)
# WEIGHT SCALE
# Triton kernel supports only per-tensor and per-channel.
# If we have a fused module (QKV, MLP) with per tensor scales (thus N
# scales being passed to the kernel), convert to the per-channel case.
is_fused_module
=
len
(
layer
.
logical_widths
)
>
1
weight_scale
=
getattr
(
layer
,
self
.
w_s_name
)
if
is_fused_module
and
not
self
.
config
.
is_channelwise
:
weight_scale
=
convert_to_channelwise
(
weight_scale
,
layer
.
logical_widths
)
replace_parameter
(
layer
,
self
.
w_s_name
,
torch
.
nn
.
Parameter
(
weight_scale
.
data
,
requires_grad
=
False
),
)
# INPUT SCALE
if
self
.
config
.
is_static_input_scheme
:
input_scale
=
getattr
(
layer
,
self
.
i_s_name
)
...
...
vllm/model_executor/layers/quantization/ptpc_fp8.py
View file @
bde57ab2
...
...
@@ -103,11 +103,12 @@ class PTPCFp8LinearMethod(Fp8LinearMethod):
)
def
process_weights_after_loading
(
self
,
layer
:
torch
.
nn
.
Module
)
->
None
:
layer
.
weight
=
torch
.
nn
.
Parameter
(
layer
.
weight
.
data
,
requires_grad
=
False
)
assert
layer
.
weight
.
data
.
dtype
==
torch
.
bfloat16
,
(
f
"Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support output dtype of bfloat16.
{
str
(
layer
.
weight
.
data
.
dtype
)
}
is specified."
# noqa: E501
assert
layer
.
weight
.
data
.
dtype
not
in
(
torch
.
float16
,
torch
.
float32
),
(
"Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support "
f
"output dtype of bfloat16.
{
layer
.
weight
.
data
.
dtype
}
is specified."
)
if
layer
.
weight
.
data
.
dtype
==
torch
.
bfloat16
:
# Quantize the weights.
qweight
,
weight_scale
=
ops
.
scaled_fp8_quant
(
layer
.
weight
,
scale
=
None
,
use_per_token_if_dynamic
=
True
...
...
@@ -118,6 +119,9 @@ class PTPCFp8LinearMethod(Fp8LinearMethod):
qweight
.
t
(),
requires_grad
=
False
)
# Pretranspose the weight
layer
.
weight_scale
=
Parameter
(
weight_scale
,
requires_grad
=
False
)
else
:
assert
layer
.
weight
.
data
.
dtype
==
current_platform
.
fp8_dtype
()
assert
getattr
(
layer
,
"weight_scale"
,
None
)
is
not
None
layer
.
input_scale
=
None
def
apply
(
...
...
vllm/platforms/rocm.py
View file @
bde57ab2
...
...
@@ -170,7 +170,9 @@ class RocmPlatform(Platform):
supported_quantization
:
list
[
str
]
=
[
"awq"
,
"awq_marlin"
,
# will be overwritten with awq
"gptq"
,
"gptq_marlin"
,
# will be overwritten with gptq
"fp8"
,
"compressed-tensors"
,
"fbgemm_fp8"
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment