Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f1eb27b8
Commit
f1eb27b8
authored
Apr 16, 2026
by
zhaosong
Committed by
zhangzbb
Apr 16, 2026
Browse files
support v0.11.0 online int8/fp8 quantization
parent
49a30c70
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
467 additions
and
43 deletions
+467
-43
csrc/ops.h
csrc/ops.h
+5
-5
csrc/torch_bindings.cpp
csrc/torch_bindings.cpp
+13
-28
vllm/_custom_ops.py
vllm/_custom_ops.py
+4
-3
vllm/model_executor/layers/quantization/__init__.py
vllm/model_executor/layers/quantization/__init__.py
+3
-0
vllm/model_executor/layers/quantization/blockwise_int8.py
vllm/model_executor/layers/quantization/blockwise_int8.py
+1
-0
vllm/model_executor/layers/quantization/dcu_int8.py
vllm/model_executor/layers/quantization/dcu_int8.py
+433
-0
vllm/model_executor/layers/quantization/fp8.py
vllm/model_executor/layers/quantization/fp8.py
+1
-0
vllm/model_executor/layers/quantization/ptpc_fp8.py
vllm/model_executor/layers/quantization/ptpc_fp8.py
+5
-5
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+1
-1
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+1
-1
No files found.
csrc/ops.h
View file @
f1eb27b8
...
...
@@ -321,12 +321,12 @@ void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
void
static_scaled_fp8_quant
(
torch
::
Tensor
&
out
,
torch
::
Tensor
const
&
input
,
torch
::
Tensor
const
&
scale
);
//
void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input,
//
torch::Tensor& scale);
void
dynamic_scaled_fp8_quant
(
torch
::
Tensor
&
out
,
torch
::
Tensor
const
&
input
,
torch
::
Tensor
&
scale
);
//
void dynamic_per_token_scaled_fp8_quant(
//
torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale,
//
std::optional<torch::Tensor> const& scale_ub);
void
dynamic_per_token_scaled_fp8_quant
(
torch
::
Tensor
&
out
,
torch
::
Tensor
const
&
input
,
torch
::
Tensor
&
scale
,
std
::
optional
<
torch
::
Tensor
>
const
&
scale_ub
);
void
selective_scan_fwd
(
const
torch
::
Tensor
&
u
,
const
torch
::
Tensor
&
delta
,
const
torch
::
Tensor
&
A
,
const
torch
::
Tensor
&
B
,
...
...
csrc/torch_bindings.cpp
View file @
f1eb27b8
...
...
@@ -594,20 +594,20 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// "()");
// ops.impl("static_scaled_fp8_quant", torch::kCUDA, &static_scaled_fp8_quant);
//
// Compute dynamic-per-tensor FP8 quantized tensor and scaling factor.
//
ops.def(
//
"dynamic_scaled_fp8_quant(Tensor! result, Tensor input, Tensor! scale) "
//
"-> "
//
"()");
//
ops.impl("dynamic_scaled_fp8_quant", torch::kCUDA, &dynamic_scaled_fp8_quant);
// Compute dynamic-per-tensor FP8 quantized tensor and scaling factor.
ops
.
def
(
"dynamic_scaled_fp8_quant(Tensor! result, Tensor input, Tensor! scale) "
"-> "
"()"
);
ops
.
impl
(
"dynamic_scaled_fp8_quant"
,
torch
::
kCUDA
,
&
dynamic_scaled_fp8_quant
);
//
// Compute dynamic-per-token FP8 quantized tensor and scaling factor.
//
ops.def(
//
"dynamic_per_token_scaled_fp8_quant(Tensor! result, Tensor input, "
//
"Tensor! scale, Tensor? scale_ub) -> "
//
"()");
//
ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA,
//
&dynamic_per_token_scaled_fp8_quant);
// Compute dynamic-per-token FP8 quantized tensor and scaling factor.
ops
.
def
(
"dynamic_per_token_scaled_fp8_quant(Tensor! result, Tensor input, "
"Tensor! scale, Tensor? scale_ub) -> "
"()"
);
ops
.
impl
(
"dynamic_per_token_scaled_fp8_quant"
,
torch
::
kCUDA
,
&
dynamic_per_token_scaled_fp8_quant
);
// Compute int8 quantized tensor for given scaling factor.
ops
.
def
(
...
...
@@ -615,21 +615,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"()"
);
ops
.
impl
(
"static_scaled_fp8_quant"
,
torch
::
kCUDA
,
&
static_scaled_fp8_quant
);
// // Compute dynamic-per-tensor FP8 quantized tensor and scaling factor.
// ops.def(
// "dynamic_scaled_fp8_quant(Tensor! result, Tensor input, Tensor! scale) "
// "-> "
// "()");
// ops.impl("dynamic_scaled_fp8_quant", torch::kCUDA, &dynamic_scaled_fp8_quant);
// // Compute dynamic-per-token FP8 quantized tensor and scaling factor.
// ops.def(
// "dynamic_per_token_scaled_fp8_quant(Tensor! result, Tensor input, "
// "Tensor! scale, Tensor? scale_ub) -> "
// "()");
// ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA,
// &dynamic_per_token_scaled_fp8_quant);
// Compute int8 quantized tensor for given scaling factor.
ops
.
def
(
"static_scaled_int8_quant(Tensor! result, Tensor input, Tensor scale,"
...
...
vllm/_custom_ops.py
View file @
f1eb27b8
...
...
@@ -1419,9 +1419,10 @@ def scaled_fp8_quant(
scale
=
torch
.
empty
((
shape
[
0
],
1
),
device
=
input
.
device
,
dtype
=
torch
.
float32
)
# torch.ops._C.dynamic_per_token_scaled_fp8_quant(
# output, input.contiguous(), scale, scale_ub)
output
,
scale
=
per_token_quant_fp8
(
input
.
contiguous
())
torch
.
ops
.
_C
.
dynamic_per_token_scaled_fp8_quant
(
output
,
input
.
contiguous
(),
scale
,
scale_ub
)
# per_token_quant_fp8 has precision problem.
# output, scale = per_token_quant_fp8(input.contiguous())
else
:
scale
=
torch
.
zeros
(
1
,
device
=
input
.
device
,
dtype
=
torch
.
float32
)
torch
.
ops
.
_C
.
dynamic_scaled_fp8_quant
(
output
,
input
,
scale
)
...
...
vllm/model_executor/layers/quantization/__init__.py
View file @
f1eb27b8
...
...
@@ -11,6 +11,7 @@ QuantizationMethods = Literal[
"deepspeedfp"
,
"tpu_int8"
,
"fp8"
,
"dcu_int8"
,
"ptpc_fp8"
,
"fbgemm_fp8"
,
"modelopt"
,
...
...
@@ -103,6 +104,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
from
.experts_int8
import
ExpertsInt8Config
from
.fbgemm_fp8
import
FBGEMMFp8Config
from
.fp8
import
Fp8Config
from
.dcu_int8
import
DcuInt8Config
from
.gguf
import
GGUFConfig
from
.gptq
import
GPTQConfig
from
.gptq_bitblas
import
GPTQBitBLASConfig
...
...
@@ -128,6 +130,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
"deepspeedfp"
:
DeepSpeedFPConfig
,
"tpu_int8"
:
Int8TpuConfig
,
"fp8"
:
Fp8Config
,
"dcu_int8"
:
DcuInt8Config
,
"fbgemm_fp8"
:
FBGEMMFp8Config
,
"modelopt"
:
ModelOptFp8Config
,
"modelopt_fp4"
:
ModelOptNvFp4Config
,
...
...
vllm/model_executor/layers/quantization/blockwise_int8.py
View file @
f1eb27b8
...
...
@@ -134,6 +134,7 @@ class BlockInt8LinearMethod(LinearMethodBase):
def
__init__
(
self
,
quant_config
:
BlockInt8Config
):
self
.
quant_config
=
quant_config
raise
ValueError
(
vars
(
quant_config
))
self
.
tritonsingleton
=
W8a8GetCacheJSON
()
self
.
block_size
=
self
.
quant_config
.
weight_block_size
...
...
vllm/model_executor/layers/quantization/dcu_int8.py
0 → 100644
View file @
f1eb27b8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
TYPE_CHECKING
,
Any
,
Callable
,
Optional
,
Union
import
torch
from
torch.nn
import
Module
from
torch.nn.parameter
import
Parameter
import
vllm.envs
as
envs
import
vllm.model_executor.layers.fused_moe.modular_kernel
as
mk
from
vllm
import
_custom_ops
as
ops
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.fused_moe
import
(
FusedMoE
,
FusedMoEActivationFormat
,
FusedMoEMethodBase
,
FusedMoEPermuteExpertsUnpermute
,
FusedMoEPrepareAndFinalize
,
FusedMoeWeightScaleSupported
)
from
vllm.model_executor.layers.fused_moe.config
import
(
FusedMoEQuantConfig
)
from
vllm.model_executor.layers.fused_moe.layer
import
(
UnquantizedFusedMoEMethod
)
from
vllm.model_executor.layers.linear
import
(
LinearBase
,
LinearMethodBase
,
UnquantizedLinearMethod
)
from
vllm.model_executor.layers.quantization
import
QuantizationMethods
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
,
QuantizeMethodBase
)
from
vllm.model_executor.layers.quantization.kv_cache
import
BaseKVCacheMethod
from
vllm.model_executor.layers.quantization.utils.flashinfer_utils
import
(
FlashinferMoeBackend
,
get_flashinfer_moe_backend
,
register_moe_scaling_factors
,
swap_w13_to_w31
)
from
vllm.model_executor.layers.quantization.utils.fp8_utils
import
(
apply_fp8_block_linear
,
check_aiter_fp8_linear_support
,
create_fp8_input_scale
,
create_fp8_scale_parameter
,
create_fp8_weight_parameter
,
expert_weight_is_col_major
,
maybe_post_process_fp8_weight_block
,
process_fp8_weight_block_strategy
,
process_fp8_weight_tensor_strategy
,
requant_weight_ue8m0_inplace
,
validate_fp8_block_shape
)
from
vllm.model_executor.layers.quantization.utils.marlin_utils_fp8
import
(
apply_fp8_marlin_linear
,
prepare_fp8_layer_for_marlin
,
prepare_moe_fp8_layer_for_marlin
)
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
GroupShape
,
is_layer_skipped
)
from
vllm.model_executor.layers.quantization.utils.w8a8_utils
import
(
Fp8LinearOp
,
all_close_1d
,
cutlass_block_fp8_supported
,
cutlass_fp8_supported
,
maybe_create_device_identity
,
normalize_e4m3fn_to_e4m3fnuz
,
per_tensor_dequantize
)
from
vllm.model_executor.parameter
import
(
BlockQuantScaleParameter
,
ModelWeightParameter
,
PerTensorScaleParameter
)
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.platforms
import
current_platform
from
vllm.scalar_type
import
scalar_types
from
vllm.utils
import
has_deep_gemm
from
vllm.utils.deep_gemm
import
(
get_col_major_tma_aligned_tensor
,
is_deep_gemm_e8m0_used
,
is_deep_gemm_supported
)
from
vllm.utils.flashinfer
import
has_flashinfer_moe
from
vllm.model_executor.layers.quantization.utils.w8a8_utils
import
apply_int8_linear
if
TYPE_CHECKING
:
from
vllm.model_executor.models.utils
import
WeightsMapper
# Activation quantization schemes accepted by DcuInt8Config.__init__;
# any other value is rejected there with a ValueError.
ACTIVATION_SCHEMES = ["static", "dynamic"]

# Module-level logger obtained from vLLM's init_logger.
logger = init_logger(__name__)
class DcuInt8Config(QuantizationConfig):
    """Config class for DcuInt8.

    Holds the options consumed by ``DcuInt8LinearMethod`` and the MoE /
    KV-cache methods returned from ``get_quant_method``.
    """

    def __init__(
        self,
        is_checkpoint_fp8_serialized: bool = False,
        activation_scheme: str = "dynamic",
        ignored_layers: Optional[list[str]] = None,
        weight_block_size: Optional[list[int]] = None,
    ) -> None:
        """Validate and store the quantization options.

        Args:
            is_checkpoint_fp8_serialized: Whether the checkpoint already
                stores fp8 weights (``from_config`` derives this from the
                ``quant_method`` entry).
            activation_scheme: One of ``ACTIVATION_SCHEMES``.
            ignored_layers: Layer names to leave unquantized; ``None`` is
                stored as an empty list.
            weight_block_size: Optional two-element block size for
                block-wise quantization.

        Raises:
            ValueError: If ``activation_scheme`` is unknown, or if
                ``weight_block_size`` is given without an fp8-serialized
                checkpoint, with a length other than 2, or with a
                non-dynamic activation scheme.
        """
        super().__init__()

        self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized

        if activation_scheme not in ACTIVATION_SCHEMES:
            raise ValueError(
                f"Unsupported activation scheme {activation_scheme}")
        self.activation_scheme = activation_scheme
        self.ignored_layers = ignored_layers or []
        if weight_block_size is not None:
            if not is_checkpoint_fp8_serialized:
                raise ValueError(
                    "The block-wise quantization only supports fp8-serialized "
                    "checkpoint for now.")
            if len(weight_block_size) != 2:
                raise ValueError(
                    "The quantization block size of weight must have 2 "
                    f"dimensions, but got {len(weight_block_size)} dimensions")
            if activation_scheme != "dynamic":
                raise ValueError("The block-wise quantization only supports "
                                 "dynamic activation scheme for now, but got "
                                 f"{activation_scheme} activation scheme.")
        self.weight_block_size = weight_block_size

    @classmethod
    def get_name(cls) -> QuantizationMethods:
        """Return the name this config is registered under."""
        # The registry maps "dcu_int8" -> DcuInt8Config and "fp8" -> Fp8Config.
        # Returning "fp8" here (a copy-paste leftover) made this config
        # indistinguishable from Fp8Config for any caller keying on the name.
        return "dcu_int8"

    @classmethod
    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
        """Return the activation dtypes this method accepts."""
        return [torch.bfloat16, torch.half]

    @classmethod
    def get_min_capability(cls) -> int:
        """Return the minimum device capability value (80)."""
        # NOTE(review): value inherited from the fp8 config; confirm it is the
        # intended threshold for DCU devices.
        return 80

    @classmethod
    def get_config_filenames(cls) -> list[str]:
        """Return the extra config file names to look for (none)."""
        return []

    def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
        """Rewrite ``ignored_layers`` through ``hf_to_vllm_mapper``."""
        if self.ignored_layers is not None:
            self.ignored_layers = hf_to_vllm_mapper.apply_list(
                self.ignored_layers)

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> "DcuInt8Config":
        """Build a config from a quantization config dictionary.

        ``quant_method`` and ``activation_scheme`` are looked up with
        ``get_from_keys``; ``ignored_layers`` and ``weight_block_size`` are
        optional. When ``ignored_layers`` is missing or empty, the
        ``modules_to_not_convert`` entry is used instead.
        """
        quant_method = cls.get_from_keys(config, ["quant_method"])
        is_checkpoint_fp8_serialized = ("fp8" in quant_method)
        activation_scheme = cls.get_from_keys(config, ["activation_scheme"])
        ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None)
        weight_block_size = cls.get_from_keys_or(config, ["weight_block_size"],
                                                 None)
        if not ignored_layers:
            ignored_layers = cls.get_from_keys_or(config,
                                                  ["modules_to_not_convert"],
                                                  None)
        return cls(is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized,
                   activation_scheme=activation_scheme,
                   ignored_layers=ignored_layers,
                   weight_block_size=weight_block_size)

    def get_xpu_quant_method(self, layer: torch.nn.Module,
                             prefix: str) -> Optional["QuantizeMethodBase"]:
        """Select the quantization method for ``layer`` on XPU platforms.

        Returns ``None`` when ``layer`` is not a linear, fused-MoE or
        attention layer.
        """
        from vllm.attention.layer import Attention
        from vllm.model_executor.layers.quantization.ipex_quant import (
            XPUFp8LinearMethod, XPUFp8MoEMethod)
        # A fresh config carrying the same options is handed to the XPU
        # methods.
        fp8_config = DcuInt8Config(
            is_checkpoint_fp8_serialized=self.is_checkpoint_fp8_serialized,
            activation_scheme=self.activation_scheme,
            ignored_layers=self.ignored_layers,
            weight_block_size=self.weight_block_size)

        if isinstance(layer, LinearBase):
            if is_layer_skipped(prefix=prefix,
                                ignored_layers=self.ignored_layers,
                                fused_mapping=self.packed_modules_mapping):
                return UnquantizedLinearMethod()
            return XPUFp8LinearMethod(fp8_config)
        elif isinstance(layer, FusedMoE):
            return XPUFp8MoEMethod(fp8_config, layer)
        elif isinstance(layer, Attention):
            return Fp8KVCacheMethod(self)
        return None

    def get_quant_method(self, layer: torch.nn.Module,
                         prefix: str) -> Optional["QuantizeMethodBase"]:
        """Select the quantization method for ``layer``.

        Linear layers get ``DcuInt8LinearMethod`` (or the unquantized method
        when skipped), fused-MoE layers get ``Fp8MoEMethod`` (or the
        unquantized MoE method when skipped) and attention layers get
        ``Fp8KVCacheMethod``. Any other layer type yields ``None``. On XPU
        the decision is delegated to ``get_xpu_quant_method``.
        """
        from vllm.attention.layer import Attention  # Avoid circular import

        if current_platform.is_xpu():
            return self.get_xpu_quant_method(layer, prefix)
        if isinstance(layer, LinearBase):
            if is_layer_skipped(prefix=prefix,
                                ignored_layers=self.ignored_layers,
                                fused_mapping=self.packed_modules_mapping):
                return UnquantizedLinearMethod()
            return DcuInt8LinearMethod(self)
        elif isinstance(layer, FusedMoE):
            if is_layer_skipped(prefix=prefix,
                                ignored_layers=self.ignored_layers,
                                fused_mapping=self.packed_modules_mapping):
                return UnquantizedFusedMoEMethod(layer.moe_config)
            # Fp8MoEMethod lives in the fp8 module and was never brought into
            # scope here, so this branch used to raise NameError. Import it
            # lazily, next to the other function-scope import above.
            from vllm.model_executor.layers.quantization.fp8 import (
                Fp8MoEMethod)
            return Fp8MoEMethod(self, layer)
        elif isinstance(layer, Attention):
            return Fp8KVCacheMethod(self)
        return None

    def get_cache_scale(self, name: str) -> Optional[str]:
        """
        Check whether the param name matches the format for k/v cache scales
        in compressed-tensors. If this is the case, return its equivalent
        param name expected by vLLM

        :param name: param name
        :return: matching param name for KV cache scale in vLLM
        """
        if name.endswith(".output_scale") and ".k_proj" in name:
            return name.replace(".k_proj.output_scale", ".attn.k_scale")
        if name.endswith(".output_scale") and ".v_proj" in name:
            return name.replace(".v_proj.output_scale", ".attn.v_scale")
        if name.endswith(".output_scale") and ".q_proj" in name:
            return name.replace(".q_proj.output_scale", ".attn.q_scale")
        if name.endswith("self_attn.prob_output_scale"):
            return name.replace(".prob_output_scale", ".attn.prob_scale")
        # If no matches, return None
        return None
class DcuInt8LinearMethod(LinearMethodBase):
    """Linear method selected by ``DcuInt8Config`` for linear layers.

    Three weight paths exist:

    * block-quantized fp8 checkpoints (``weight_block_size`` set), applied
      with ``apply_fp8_block_linear``;
    * per-tensor fp8-serialized checkpoints, whose shards are processed by
      ``process_fp8_weight_tensor_strategy``;
    * checkpoints that are not fp8-serialized, whose weights are quantized
      after loading with ``ops.scaled_int8_quant``.

    Unless the Marlin or block path is taken, the forward pass goes through
    ``apply_int8_linear``.

    Args:
        quant_config: The quantization config.
    """

    def __init__(self, quant_config: DcuInt8Config):
        self.quant_config = quant_config
        self.cutlass_block_fp8_supported = cutlass_block_fp8_supported()
        self.out_dtype = torch.get_default_dtype()

        # Marlin is chosen when the device lacks capability 89 or when the
        # test override is set, but never on ROCm.
        marlin_wanted = (not current_platform.has_device_capability(89)
                         or envs.VLLM_TEST_FORCE_FP8_MARLIN)
        if current_platform.is_rocm():
            marlin_wanted = False
        self.use_marlin = marlin_wanted

        self.use_aiter_and_is_supported = check_aiter_fp8_linear_support()

        block_size = quant_config.weight_block_size
        self.weight_block_size = block_size
        self.block_quant = block_size is not None

        static_act = quant_config.activation_scheme == "static"
        self.act_q_static = static_act
        # Per-token activation groups only for dynamic activations when
        # cutlass fp8 is supported; per-tensor otherwise.
        self.act_q_group_shape = (
            GroupShape.PER_TENSOR
            if static_act or not cutlass_fp8_supported() else
            GroupShape.PER_TOKEN)

        self.fp8_linear = Fp8LinearOp(
            act_quant_static=static_act,
            act_quant_group_shape=self.act_q_group_shape)

    def create_weights(
        self,
        layer: torch.nn.Module,
        input_size_per_partition: int,
        output_partition_sizes: list[int],
        input_size: int,
        output_size: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        """Register the weight (and, for fp8 checkpoints, scale) parameters.

        For checkpoints that are not fp8-serialized only ``weight`` is
        registered, in ``params_dtype``; scales are produced later in
        ``process_weights_after_loading``.
        """
        maybe_create_device_identity()

        loader = extra_weight_attrs.get("weight_loader")
        out_per_partition = sum(output_partition_sizes)
        fp8_serialized = self.quant_config.is_checkpoint_fp8_serialized

        # Bookkeeping attributes stored on the layer.
        layer.logical_widths = output_partition_sizes
        layer.input_size_per_partition = input_size_per_partition
        layer.output_size_per_partition = out_per_partition
        layer.orig_dtype = params_dtype
        layer.weight_block_size = None

        if self.block_quant:
            assert self.weight_block_size is not None
            layer.weight_block_size = self.weight_block_size
            validate_fp8_block_shape(layer, input_size, output_size,
                                     input_size_per_partition,
                                     output_partition_sizes,
                                     self.weight_block_size)

        # Weight tensor: fp8 parameter for serialized checkpoints, original
        # dtype otherwise.
        if fp8_serialized:
            weight_param = create_fp8_weight_parameter(
                out_per_partition, input_size_per_partition, loader)
        else:
            storage = torch.empty(out_per_partition,
                                  input_size_per_partition,
                                  dtype=params_dtype)
            weight_param = ModelWeightParameter(data=storage,
                                                input_dim=1,
                                                output_dim=0,
                                                weight_loader=loader)
        layer.register_parameter("weight", weight_param)

        # Scales are only loaded from fp8-serialized checkpoints.
        if not fp8_serialized:
            return

        # Weight scale: per-tensor, or block-wise under the name
        # "weight_scale_inv" (the name is intentional for deepseekv3).
        if self.block_quant:
            assert not self.act_q_static
            assert self.weight_block_size is not None
            scale_cls = BlockQuantScaleParameter
            scale_block = self.weight_block_size
            scale_name = "weight_scale_inv"
        else:
            scale_cls = PerTensorScaleParameter
            scale_block = None
            scale_name = "weight_scale"
        weight_scale = create_fp8_scale_parameter(scale_cls,
                                                  output_partition_sizes,
                                                  input_size_per_partition,
                                                  scale_block, loader)
        set_weight_attrs(weight_scale, {"scale_type": "weight_scale"})
        layer.register_parameter(scale_name, weight_scale)

        # Input activation scale: a real parameter only for static schemes.
        act_scale = None
        if self.act_q_static:
            act_scale = create_fp8_input_scale(output_partition_sizes, loader)
            set_weight_attrs(act_scale, {"scale_type": "input_scale"})
        layer.register_parameter("input_scale", act_scale)

    def process_weights_after_loading(self, layer: Module) -> None:
        """Convert loaded weights into the layout used by ``apply``."""
        k_first = not self.block_quant
        act_scale = None

        # TODO(rob): refactor block quant into separate class.
        if self.block_quant:
            assert not self.act_q_static
            new_weight, new_scale = process_fp8_weight_block_strategy(
                layer.weight, layer.weight_scale_inv)
            # Drop weight_scale_inv so it is not confused with weight_scale.
            del layer.weight_scale_inv
        elif self.quant_config.is_checkpoint_fp8_serialized:
            # Per-tensor fp8 checkpoint: a fused module carries N scales for
            # its N shards.
            new_weight = layer.weight
            new_scale = layer.weight_scale
            if not self.use_marlin:
                # Requantize the logical shards as a single weight.
                new_weight, new_scale, act_scale = (
                    process_fp8_weight_tensor_strategy(
                        new_weight, new_scale, layer.logical_widths,
                        getattr(layer, 'input_scale', None)))
                if self.act_q_static:
                    assert act_scale is not None
                    act_scale = act_scale.max()
            new_weight = new_weight.t()
        else:
            # Checkpoint is not fp8-serialized: quantize the weight to int8
            # now and keep it untransposed.
            quantized, new_scale, _ = ops.scaled_int8_quant(layer.weight,
                                                            scale=None)
            new_weight = quantized.contiguous()

        # Replace the layer's parameters with the processed values.
        layer.weight = Parameter(new_weight.data, requires_grad=False)
        layer.weight_scale = Parameter(new_scale.data, requires_grad=False)
        if act_scale is None:
            layer.input_scale = None
        else:
            layer.input_scale = Parameter(act_scale, requires_grad=False)

        if self.use_marlin:
            prepare_fp8_layer_for_marlin(layer, k_first)
            # Activations not quantized for marlin.
            del layer.input_scale
            return

        if self.block_quant:
            maybe_post_process_fp8_weight_block(
                layer, self.cutlass_block_fp8_supported)

    def apply(self,
              layer: torch.nn.Module,
              x: torch.Tensor,
              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Run the linear layer on ``x`` using the configured kernel path."""
        if self.use_marlin:
            marlin_out = apply_fp8_marlin_linear(
                input=x,
                weight=layer.weight,
                weight_scale=layer.weight_scale,
                workspace=layer.workspace,
                size_n=layer.output_size_per_partition,
                size_k=layer.input_size_per_partition,
                bias=bias)
            return marlin_out

        if self.block_quant:
            block_out = apply_fp8_block_linear(
                layer,
                input=x,
                bias=bias,
                cutlass_block_fp8_supported=self.cutlass_block_fp8_supported,
                use_aiter_and_is_supported=self.use_aiter_and_is_supported)
            return block_out

        int8_out = apply_int8_linear(input=x,
                                     weight=layer.weight,
                                     weight_scale=layer.weight_scale,
                                     input_scale=layer.input_scale,
                                     bias=bias,
                                     w8a8_strategy=3)
        return int8_out
class Fp8KVCacheMethod(BaseKVCacheMethod):
    """KV-cache method returned by ``DcuInt8Config`` for attention layers.

    Supports loading kv-cache scaling factors from FP8 checkpoints; all
    behaviour comes from ``BaseKVCacheMethod``.
    """

    def __init__(self, quant_config: DcuInt8Config) -> None:
        # Nothing to add here: just forward the config to the base class.
        super().__init__(quant_config)
vllm/model_executor/layers/quantization/fp8.py
View file @
f1eb27b8
...
...
@@ -79,6 +79,7 @@ class Fp8Config(QuantizationConfig):
weight_block_size
:
Optional
[
list
[
int
]]
=
None
,
)
->
None
:
super
().
__init__
()
# raise ValueError(weight_block_size)
self
.
is_checkpoint_fp8_serialized
=
is_checkpoint_fp8_serialized
...
...
vllm/model_executor/layers/quantization/ptpc_fp8.py
View file @
f1eb27b8
...
...
@@ -39,10 +39,10 @@ class PTPCFp8Config(Fp8Config):
raise
ValueError
(
"ptpc_fp8 quantization is supported only on ROCm."
)
if
not
current_platform
.
has_device_capability
(
94
):
raise
ValueError
(
"ptpc_fp8 quantization is supported only on AMD Instinct MI300 GPUs and newer."
# noqa: E501
)
#
if not current_platform.has_device_capability(94):
#
raise ValueError(
#
"ptpc_fp8 quantization is supported only on AMD Instinct MI300 GPUs and newer." # noqa: E501
#
)
if
activation_scheme
==
"static"
:
raise
ValueError
(
"ptpc_fp8 as of now only support dynamic quantization."
)
...
...
@@ -112,7 +112,7 @@ class PTPCFp8LinearMethod(Fp8LinearMethod):
# Update the layer with the new values.
layer
.
weight
=
Parameter
(
qweight
.
t
(),
requires_grad
=
False
)
# Pretranspose the weight
qweight
.
contiguous
(),
requires_grad
=
False
)
# Pretranspose the weight
layer
.
weight_scale
=
Parameter
(
weight_scale
,
requires_grad
=
False
)
layer
.
input_scale
=
None
...
...
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
View file @
f1eb27b8
...
...
@@ -495,7 +495,7 @@ def apply_int8_linear(
# ops.scaled_int8_quant supports both dynamic and static quant.
# * dynamic, layer.input_scale is None and x_scale computed from x.
# * static, layer.input_scale is scalar and x_scale is input_scale.
# print(1111)
symmetric
=
azp_adj
is
None
if
input_scale
is
None
and
input_zero_point
is
None
and
symmetric
is
True
:
x_q
,
x_scale
=
per_token_quant_int8
(
input
)
...
...
vllm/platforms/rocm.py
View file @
f1eb27b8
...
...
@@ -189,7 +189,7 @@ class RocmPlatform(Platform):
supported_quantization
:
list
[
str
]
=
[
"awq"
,
"gptq"
,
"fp8"
,
"compressed-tensors"
,
"fbgemm_fp8"
,
"gguf"
,
"quark"
,
"ptpc_fp8"
,
"mxfp4"
,
"petit_nvfp4"
,
"torchao"
,
"quark"
,
"ptpc_fp8"
,
"mxfp4"
,
"petit_nvfp4"
,
"torchao"
,
"dcu_int8"
,
"moe_wna16"
,
"slimquant_w4a8"
,
"w8a8_int8"
,
"awq_marlin"
,
"slimquant_w4a8_marlin"
,
"slimquant_compressed_tensors_marlin"
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment