support v0.11.0 online int8/fp8 quantization

f1eb27b8 · zhaosong · zhangzbb · 49a30c70 · f1eb27b8 · f1eb27b8
Commit f1eb27b8 authored Apr 16, 2026 by zhaosong Committed by zhangzbb Apr 16, 2026
10 changed files
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -321,12 +321,12 @@ void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
 void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input,
                             torch::Tensor const& scale);

-// void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input,
-//                               torch::Tensor& scale);
+void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input,
+                              torch::Tensor& scale);

-// void dynamic_per_token_scaled_fp8_quant(
-//     torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale,
-//     std::optional<torch::Tensor> const& scale_ub);
+void dynamic_per_token_scaled_fp8_quant(
+    torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale,
+    std::optional<torch::Tensor> const& scale_ub);

 void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta,
                        const torch::Tensor& A, const torch::Tensor& B,

--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -594,20 +594,20 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 //       "()");
 //   ops.impl("static_scaled_fp8_quant", torch::kCUDA, &static_scaled_fp8_quant);

-//   // Compute dynamic-per-tensor FP8 quantized tensor and scaling factor.
-//   ops.def(
-//       "dynamic_scaled_fp8_quant(Tensor! result, Tensor input, Tensor! scale) "
-//       "-> "
-//       "()");
-//   ops.impl("dynamic_scaled_fp8_quant", torch::kCUDA, &dynamic_scaled_fp8_quant);
+  // Compute dynamic-per-tensor FP8 quantized tensor and scaling factor.
+  ops.def(
+      "dynamic_scaled_fp8_quant(Tensor! result, Tensor input, Tensor! scale) "
+      "-> "
+      "()");
+  ops.impl("dynamic_scaled_fp8_quant", torch::kCUDA, &dynamic_scaled_fp8_quant);

-//   // Compute dynamic-per-token FP8 quantized tensor and scaling factor.
-//   ops.def(
-//       "dynamic_per_token_scaled_fp8_quant(Tensor! result, Tensor input, "
-//       "Tensor! scale, Tensor? scale_ub) -> "
-//       "()");
-//   ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA,
-//            &dynamic_per_token_scaled_fp8_quant);
+  // Compute dynamic-per-token FP8 quantized tensor and scaling factor.
+  ops.def(
+      "dynamic_per_token_scaled_fp8_quant(Tensor! result, Tensor input, "
+      "Tensor! scale, Tensor? scale_ub) -> "
+      "()");
+  ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA,
+           &dynamic_per_token_scaled_fp8_quant);

  // Compute int8 quantized tensor for given scaling factor.
  ops.def(
@@ -615,21 +615,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "()");
  ops.impl("static_scaled_fp8_quant", torch::kCUDA, &static_scaled_fp8_quant);

-//   // Compute dynamic-per-tensor FP8 quantized tensor and scaling factor.
-//   ops.def(
-//       "dynamic_scaled_fp8_quant(Tensor! result, Tensor input, Tensor! scale) "
-//       "-> "
-//       "()");
-//   ops.impl("dynamic_scaled_fp8_quant", torch::kCUDA, &dynamic_scaled_fp8_quant);
-
-//   // Compute dynamic-per-token FP8 quantized tensor and scaling factor.
-//   ops.def(
-//       "dynamic_per_token_scaled_fp8_quant(Tensor! result, Tensor input, "
-//       "Tensor! scale, Tensor? scale_ub) -> "
-//       "()");
-//   ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA,
-//            &dynamic_per_token_scaled_fp8_quant);
-
  // Compute int8 quantized tensor for given scaling factor.
  ops.def(
      "static_scaled_int8_quant(Tensor! result, Tensor input, Tensor scale,"

--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1419,9 +1419,10 @@ def scaled_fp8_quant(
            scale = torch.empty((shape[0], 1),
                                device=input.device,
                                dtype=torch.float32)
-            # torch.ops._C.dynamic_per_token_scaled_fp8_quant(
-            #     output, input.contiguous(), scale, scale_ub)
-            output, scale = per_token_quant_fp8(input.contiguous())
+            torch.ops._C.dynamic_per_token_scaled_fp8_quant(
+                output, input.contiguous(), scale, scale_ub)
+            # NOTE: the per_token_quant_fp8 fallback had precision problems,
+            # so the torch.ops._C per-token kernel above is used instead.
        else:
            scale = torch.zeros(1, device=input.device, dtype=torch.float32)
            torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)

--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -11,6 +11,7 @@ QuantizationMethods = Literal[
    "deepspeedfp",
    "tpu_int8",
    "fp8",
+    "dcu_int8",
    "ptpc_fp8",
    "fbgemm_fp8",
    "modelopt",
@@ -103,6 +104,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
    from .experts_int8 import ExpertsInt8Config
    from .fbgemm_fp8 import FBGEMMFp8Config
    from .fp8 import Fp8Config
+    from .dcu_int8 import DcuInt8Config
    from .gguf import GGUFConfig
    from .gptq import GPTQConfig
    from .gptq_bitblas import GPTQBitBLASConfig
@@ -128,6 +130,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
        "deepspeedfp": DeepSpeedFPConfig,
        "tpu_int8": Int8TpuConfig,
        "fp8": Fp8Config,
+        "dcu_int8": DcuInt8Config,
        "fbgemm_fp8": FBGEMMFp8Config,
        "modelopt": ModelOptFp8Config,
        "modelopt_fp4": ModelOptNvFp4Config,

--- a/vllm/model_executor/layers/quantization/blockwise_int8.py
+++ b/vllm/model_executor/layers/quantization/blockwise_int8.py
@@ -134,6 +134,7 @@ class BlockInt8LinearMethod(LinearMethodBase):

    def __init__(self, quant_config: BlockInt8Config):
        self.quant_config = quant_config
+        raise ValueError(vars(quant_config))
        self.tritonsingleton= W8a8GetCacheJSON()
        self.block_size=self.quant_config.weight_block_size
        

--- a/vllm/model_executor/layers/quantization/dcu_int8.py
+++ b/vllm/model_executor/layers/quantization/dcu_int8.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import TYPE_CHECKING, Any, Callable, Optional, Union
+
+import torch
+from torch.nn import Module
+from torch.nn.parameter import Parameter
+
+import vllm.envs as envs
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm import _custom_ops as ops
+from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.logger import init_logger
+from vllm.model_executor.layers.fused_moe import (
+    FusedMoE, FusedMoEActivationFormat, FusedMoEMethodBase,
+    FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize,
+    FusedMoeWeightScaleSupported)
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEQuantConfig)
+from vllm.model_executor.layers.fused_moe.layer import (
+    UnquantizedFusedMoEMethod)
+from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
+                                               UnquantizedLinearMethod)
+from vllm.model_executor.layers.quantization import QuantizationMethods
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig, QuantizeMethodBase)
+from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
+from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
+    FlashinferMoeBackend,get_flashinfer_moe_backend,
+    register_moe_scaling_factors, swap_w13_to_w31)
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    apply_fp8_block_linear, check_aiter_fp8_linear_support,
+    create_fp8_input_scale, create_fp8_scale_parameter,
+    create_fp8_weight_parameter, expert_weight_is_col_major,
+    maybe_post_process_fp8_weight_block, process_fp8_weight_block_strategy,
+    process_fp8_weight_tensor_strategy, requant_weight_ue8m0_inplace,
+    validate_fp8_block_shape)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
+    apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin,
+    prepare_moe_fp8_layer_for_marlin)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    GroupShape, is_layer_skipped)
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    Fp8LinearOp, all_close_1d, cutlass_block_fp8_supported,
+    cutlass_fp8_supported, maybe_create_device_identity,
+    normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize)
+from vllm.model_executor.parameter import (BlockQuantScaleParameter,
+                                           ModelWeightParameter,
+                                           PerTensorScaleParameter)
+from vllm.model_executor.utils import set_weight_attrs
+from vllm.platforms import current_platform
+from vllm.scalar_type import scalar_types
+from vllm.utils import has_deep_gemm
+from vllm.utils.deep_gemm import (get_col_major_tma_aligned_tensor,
+                                  is_deep_gemm_e8m0_used,
+                                  is_deep_gemm_supported)
+from vllm.utils.flashinfer import has_flashinfer_moe
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import apply_int8_linear
+
+if TYPE_CHECKING:
+    from vllm.model_executor.models.utils import WeightsMapper
+
+ACTIVATION_SCHEMES = ["static", "dynamic"]
+
+logger = init_logger(__name__)
+
+
+class DcuInt8Config(QuantizationConfig):
+    """Config class for the ``dcu_int8`` quantization method.
+
+    Linear layers that are not skipped are handled by
+    ``DcuInt8LinearMethod``. MoE layers are delegated to ``Fp8MoEMethod``
+    and attention layers to ``Fp8KVCacheMethod``.
+
+    Args:
+        is_checkpoint_fp8_serialized: Whether the checkpoint is treated as
+            already FP8-serialized. Defaults to ``False``.
+        activation_scheme: One of ``ACTIVATION_SCHEMES``
+            (``"static"`` or ``"dynamic"``).
+        ignored_layers: Layer names to leave unquantized. ``None`` is
+            stored as an empty list.
+        weight_block_size: Two-element block shape for block-wise
+            quantization, or ``None`` to disable it.
+
+    Raises:
+        ValueError: If ``activation_scheme`` is unknown, or if
+            ``weight_block_size`` is given without an FP8-serialized
+            checkpoint, without exactly two entries, or with a non-dynamic
+            activation scheme.
+    """
+
+    def __init__(
+        self,
+        is_checkpoint_fp8_serialized: bool = False,
+        activation_scheme: str = "dynamic",
+        ignored_layers: Optional[list[str]] = None,
+        weight_block_size: Optional[list[int]] = None,
+    ) -> None:
+        super().__init__()
+
+        self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
+
+        if activation_scheme not in ACTIVATION_SCHEMES:
+            raise ValueError(
+                f"Unsupported activation scheme {activation_scheme}")
+        self.activation_scheme = activation_scheme
+        self.ignored_layers = ignored_layers or []
+        if weight_block_size is not None:
+            if not is_checkpoint_fp8_serialized:
+                raise ValueError(
+                    "The block-wise quantization only supports fp8-serialized "
+                    "checkpoint for now.")
+            if len(weight_block_size) != 2:
+                raise ValueError(
+                    "The quantization block size of weight must have 2 "
+                    f"dimensions, but got {len(weight_block_size)} dimensions")
+            if activation_scheme != "dynamic":
+                raise ValueError("The block-wise quantization only supports "
+                                 "dynamic activation scheme for now, but got "
+                                 f"{activation_scheme} activation scheme.")
+        self.weight_block_size = weight_block_size
+
+    @classmethod
+    def get_name(cls) -> QuantizationMethods:
+        """Return the name this config is registered under."""
+        # Must match the "dcu_int8" key used in the quantization registry
+        # and in the platform's supported_quantization list; returning "fp8"
+        # here made this config indistinguishable from Fp8Config by name.
+        return "dcu_int8"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
+        """Return the activation dtypes accepted by this method."""
+        return [torch.bfloat16, torch.half]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        """Return the minimum device capability required (80)."""
+        return 80
+
+    @classmethod
+    def get_config_filenames(cls) -> list[str]:
+        """Return the config file names to search for (none)."""
+        return []
+
+    def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
+        """Rewrite ``ignored_layers`` through the given weights mapper."""
+        if self.ignored_layers is not None:
+            self.ignored_layers = hf_to_vllm_mapper.apply_list(
+                self.ignored_layers)
+
+    @classmethod
+    def from_config(cls, config: dict[str, Any]) -> "DcuInt8Config":
+        """Build a config from a quantization config dict.
+
+        Reads ``quant_method`` and ``activation_scheme`` (required) plus
+        ``ignored_layers`` / ``modules_to_not_convert`` and
+        ``weight_block_size`` (optional).
+        """
+        quant_method = cls.get_from_keys(config, ["quant_method"])
+        # The checkpoint counts as FP8-serialized when the declared
+        # quant_method mentions "fp8".
+        is_checkpoint_fp8_serialized = ("fp8" in quant_method)
+        activation_scheme = cls.get_from_keys(config, ["activation_scheme"])
+        ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None)
+        weight_block_size = cls.get_from_keys_or(config, ["weight_block_size"],
+                                                 None)
+        if not ignored_layers:
+            # Fall back to the alternative key when the first one is
+            # missing or empty.
+            ignored_layers = cls.get_from_keys_or(config,
+                                                  ["modules_to_not_convert"],
+                                                  None)
+        return cls(is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized,
+                   activation_scheme=activation_scheme,
+                   ignored_layers=ignored_layers,
+                   weight_block_size=weight_block_size)
+
+    def get_xpu_quant_method(self, layer: torch.nn.Module,
+                             prefix: str) -> Optional["QuantizeMethodBase"]:
+        """Select the quantization method for ``layer`` on XPU platforms.
+
+        Returns ``None`` for layer types that are not linear, fused-MoE or
+        attention layers.
+        """
+        from vllm.attention.layer import Attention
+        from vllm.model_executor.layers.quantization.ipex_quant import (
+            XPUFp8LinearMethod, XPUFp8MoEMethod)
+        # The XPU methods receive a fresh copy of this config.
+        fp8_config = DcuInt8Config(
+            is_checkpoint_fp8_serialized=self.is_checkpoint_fp8_serialized,
+            activation_scheme=self.activation_scheme,
+            ignored_layers=self.ignored_layers,
+            weight_block_size=self.weight_block_size)
+
+        if isinstance(layer, LinearBase):
+            if is_layer_skipped(prefix=prefix,
+                                ignored_layers=self.ignored_layers,
+                                fused_mapping=self.packed_modules_mapping):
+                return UnquantizedLinearMethod()
+            return XPUFp8LinearMethod(fp8_config)
+        elif isinstance(layer, FusedMoE):
+            return XPUFp8MoEMethod(fp8_config, layer)
+        elif isinstance(layer, Attention):
+            return Fp8KVCacheMethod(self)
+        return None
+
+    def get_quant_method(self, layer: torch.nn.Module,
+                         prefix: str) -> Optional["QuantizeMethodBase"]:
+        """Select the quantization method for ``layer``.
+
+        Skipped linear / MoE layers get their unquantized method. Returns
+        ``None`` for layer types that are not linear, fused-MoE or
+        attention layers.
+        """
+        from vllm.attention.layer import Attention  # Avoid circular import
+
+        if current_platform.is_xpu():
+            return self.get_xpu_quant_method(layer, prefix)
+        if isinstance(layer, LinearBase):
+            if is_layer_skipped(prefix=prefix,
+                                ignored_layers=self.ignored_layers,
+                                fused_mapping=self.packed_modules_mapping):
+                return UnquantizedLinearMethod()
+            return DcuInt8LinearMethod(self)
+        elif isinstance(layer, FusedMoE):
+            if is_layer_skipped(prefix=prefix,
+                                ignored_layers=self.ignored_layers,
+                                fused_mapping=self.packed_modules_mapping):
+                return UnquantizedFusedMoEMethod(layer.moe_config)
+            # Fp8MoEMethod is not defined in this module, so it has to be
+            # imported; do it lazily, as the registry does for fp8.py.
+            # NOTE(review): MoE layers therefore take the FP8 path, not an
+            # INT8 one - confirm this is intended.
+            from vllm.model_executor.layers.quantization.fp8 import (
+                Fp8MoEMethod)
+            return Fp8MoEMethod(self, layer)
+        elif isinstance(layer, Attention):
+            return Fp8KVCacheMethod(self)
+        return None
+
+    def get_cache_scale(self, name: str) -> Optional[str]:
+        """
+        Check whether the param name matches the format for k/v cache scales
+        in compressed-tensors. If this is the case, return its equivalent
+        param name expected by vLLM
+
+        :param name: param name
+        :return: matching param name for KV cache scale in vLLM
+        """
+        if name.endswith(".output_scale") and ".k_proj" in name:
+            return name.replace(".k_proj.output_scale", ".attn.k_scale")
+        if name.endswith(".output_scale") and ".v_proj" in name:
+            return name.replace(".v_proj.output_scale", ".attn.v_scale")
+        if name.endswith(".output_scale") and ".q_proj" in name:
+            return name.replace(".q_proj.output_scale", ".attn.q_scale")
+        if name.endswith("self_attn.prob_output_scale"):
+            return name.replace(".prob_output_scale", ".attn.prob_scale")
+        # If no matches, return None
+        return None
+
+
+class DcuInt8LinearMethod(LinearMethodBase):
+    """Linear method for the ``dcu_int8`` quantization config.
+
+    Two kinds of checkpoints are handled:
+
+    * Not FP8-serialized (the default): the weight is loaded in the model
+      dtype, quantized to INT8 with ``ops.scaled_int8_quant`` after loading,
+      and applied through ``apply_int8_linear``.
+    * FP8-serialized: weights and scales are loaded and post-processed with
+      the FP8 helpers (per-tensor or block-wise) and applied through the
+      matching FP8 kernels (Marlin, block-wise, or ``Fp8LinearOp``).
+
+    Args:
+        quant_config: The quantization config.
+    """
+
+    def __init__(self, quant_config: DcuInt8Config):
+        self.quant_config = quant_config
+        self.cutlass_block_fp8_supported = cutlass_block_fp8_supported()
+        self.out_dtype = torch.get_default_dtype()
+
+        # For GPUs that lack FP8 hardware support, we can leverage the Marlin
+        # kernel for fast weight-only FP8 quantization
+        # NOTE(review): when the checkpoint is not FP8-serialized the weight
+        # becomes INT8, which the FP8 Marlin path presumably cannot consume;
+        # this only matters off ROCm, where Marlin is not disabled below.
+        self.use_marlin = (not current_platform.has_device_capability(89)
+                           or envs.VLLM_TEST_FORCE_FP8_MARLIN)
+        # Disable marlin for rocm
+        if current_platform.is_rocm():
+            self.use_marlin = False
+
+        self.use_aiter_and_is_supported = check_aiter_fp8_linear_support()
+
+        self.weight_block_size = self.quant_config.weight_block_size
+        self.block_quant = self.weight_block_size is not None
+        self.act_q_static = self.quant_config.activation_scheme == "static"
+        # Use per-token quantization for better perf if dynamic and cutlass
+        if not self.act_q_static and cutlass_fp8_supported():
+            self.act_q_group_shape = GroupShape.PER_TOKEN
+        else:
+            self.act_q_group_shape = GroupShape.PER_TENSOR
+
+        # Only used by apply() for FP8-serialized per-tensor checkpoints.
+        self.fp8_linear = Fp8LinearOp(
+            act_quant_static=self.act_q_static,
+            act_quant_group_shape=self.act_q_group_shape)
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        """Register the weight (and, for FP8 checkpoints, scale) parameters.
+
+        For checkpoints that are not FP8-serialized only ``weight`` is
+        registered, in ``params_dtype``; its scale is created later in
+        ``process_weights_after_loading``.
+        """
+        maybe_create_device_identity()
+
+        output_size_per_partition = sum(output_partition_sizes)
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        layer.logical_widths = output_partition_sizes
+        layer.input_size_per_partition = input_size_per_partition
+        layer.output_size_per_partition = output_size_per_partition
+        layer.orig_dtype = params_dtype
+        layer.weight_block_size = None
+
+        if self.block_quant:
+            assert self.weight_block_size is not None
+            layer.weight_block_size = self.weight_block_size
+            validate_fp8_block_shape(layer, input_size, output_size,
+                                     input_size_per_partition,
+                                     output_partition_sizes,
+                                     self.weight_block_size)
+
+        # WEIGHT
+        if self.quant_config.is_checkpoint_fp8_serialized:
+            weight = create_fp8_weight_parameter(output_size_per_partition,
+                                                 input_size_per_partition,
+                                                 weight_loader)
+        else:
+            # For non-serialized checkpoints, use original dtype
+            weight = ModelWeightParameter(data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition,
+                dtype=params_dtype),
+                                          input_dim=1,
+                                          output_dim=0,
+                                          weight_loader=weight_loader)
+        layer.register_parameter("weight", weight)
+
+        # If checkpoint is serialized fp8, load them.
+        # Otherwise, wait until process_weights_after_loading.
+        if self.quant_config.is_checkpoint_fp8_serialized:
+            # WEIGHT SCALE
+            if not self.block_quant:
+                scale = create_fp8_scale_parameter(PerTensorScaleParameter,
+                                                   output_partition_sizes,
+                                                   input_size_per_partition,
+                                                   None, weight_loader)
+                set_weight_attrs(scale, {"scale_type": "weight_scale"})
+                layer.register_parameter("weight_scale", scale)
+            else:
+                assert not self.act_q_static
+                assert self.weight_block_size is not None
+                scale = create_fp8_scale_parameter(BlockQuantScaleParameter,
+                                                   output_partition_sizes,
+                                                   input_size_per_partition,
+                                                   self.weight_block_size,
+                                                   weight_loader)
+                set_weight_attrs(scale, {"scale_type": "weight_scale"})
+                # The weight_scale_inv name is intentional for deepseekv3
+                layer.register_parameter("weight_scale_inv", scale)
+
+            # INPUT ACTIVATION SCALE
+            if self.act_q_static:
+                scale = create_fp8_input_scale(output_partition_sizes,
+                                               weight_loader)
+                set_weight_attrs(scale, {"scale_type": "input_scale"})
+                layer.register_parameter("input_scale", scale)
+            else:
+                layer.register_parameter("input_scale", None)
+
+    def process_weights_after_loading(self, layer: Module) -> None:
+        """Convert the loaded weights into the layout used by ``apply``.
+
+        Replaces ``layer.weight``, ``layer.weight_scale`` and
+        ``layer.input_scale`` with frozen parameters (``input_scale`` is
+        ``None`` unless the FP8 per-tensor path produced one).
+        """
+        size_k_first = True
+        input_scale = None
+        # TODO(rob): refactor block quant into separate class.
+        if self.block_quant:
+            assert not self.act_q_static
+            size_k_first = False
+
+            weight, weight_scale = process_fp8_weight_block_strategy(
+                layer.weight, layer.weight_scale_inv)
+            # Delete the weight_scale_inv parameter to avoid confusion
+            # with the weight_scale parameter
+            del layer.weight_scale_inv
+
+        # If checkpoint not serialized fp8, quantize the weights.
+        elif not self.quant_config.is_checkpoint_fp8_serialized:
+            # Online INT8 quantization: with scale=None the op computes the
+            # scales itself. The third return value is not used here.
+            qweight, weight_scale, _ = ops.scaled_int8_quant(layer.weight,
+                                                         scale=None,)
+            # The INT8 weight is kept in its loaded layout (no transpose).
+            weight = qweight.contiguous()
+
+        # If checkpoint is fp8 per-tensor, handle that there are N scales for N
+        # shards in a fused module
+        else:
+            weight = layer.weight
+            weight_scale = layer.weight_scale
+
+            # If using w8a8, torch._scaled_mm needs per tensor, so
+            # requantize the logical shards as a single weight.
+            if not self.use_marlin:
+                weight, weight_scale, input_scale = (
+                    process_fp8_weight_tensor_strategy(
+                        weight, weight_scale, layer.logical_widths,
+                        getattr(layer, 'input_scale', None)))
+                if self.act_q_static:
+                    assert input_scale is not None
+                    input_scale = input_scale.max()
+            weight = weight.t()
+
+        # Update layer with new values.
+        layer.weight = Parameter(weight.data, requires_grad=False)
+        layer.weight_scale = Parameter(weight_scale.data, requires_grad=False)
+        layer.input_scale = Parameter(
+            input_scale,
+            requires_grad=False) if input_scale is not None else None
+
+        if self.use_marlin:
+            prepare_fp8_layer_for_marlin(layer, size_k_first)
+            # Activations not quantized for marlin.
+            del layer.input_scale
+            return
+
+        if self.block_quant:
+            maybe_post_process_fp8_weight_block(
+                layer, self.cutlass_block_fp8_supported)
+
+    def apply(self,
+              layer: torch.nn.Module,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Run the quantized linear layer on ``x``.
+
+        Dispatches to the kernel that matches how
+        ``process_weights_after_loading`` prepared the weights: Marlin,
+        block-wise FP8, per-tensor FP8, or INT8.
+        """
+        if self.use_marlin:
+            return apply_fp8_marlin_linear(
+                input=x,
+                weight=layer.weight,
+                weight_scale=layer.weight_scale,
+                workspace=layer.workspace,
+                size_n=layer.output_size_per_partition,
+                size_k=layer.input_size_per_partition,
+                bias=bias)
+
+        if self.block_quant:
+            return apply_fp8_block_linear(
+                layer,
+                input=x,
+                bias=bias,
+                cutlass_block_fp8_supported=self.cutlass_block_fp8_supported,
+                use_aiter_and_is_supported=self.use_aiter_and_is_supported)
+
+        if self.quant_config.is_checkpoint_fp8_serialized:
+            # Per-tensor FP8 checkpoint: the weight was requantized and
+            # transposed for the FP8 op, so it must not go through the INT8
+            # kernel below.
+            return self.fp8_linear.apply(input=x,
+                                         weight=layer.weight,
+                                         weight_scale=layer.weight_scale,
+                                         out_dtype=self.out_dtype,
+                                         input_scale=layer.input_scale,
+                                         bias=bias)
+
+        # Online INT8 path.
+        # NOTE(review): the meaning of w8a8_strategy=3 is defined by
+        # apply_int8_linear - confirm against that helper.
+        return apply_int8_linear(input=x,
+                                weight=layer.weight,
+                                weight_scale=layer.weight_scale,
+                                input_scale=layer.input_scale,
+                                bias=bias,
+                                w8a8_strategy=3)
+
+
+
+class Fp8KVCacheMethod(BaseKVCacheMethod):
+    """
+    Supports loading kv-cache scaling factors from FP8 checkpoints.
+
+    Returned by ``DcuInt8Config`` for attention layers. All behavior comes
+    from ``BaseKVCacheMethod``; this subclass only forwards the config.
+    """
+
+    def __init__(self, quant_config: DcuInt8Config):
+        super().__init__(quant_config)
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -79,6 +79,7 @@ class Fp8Config(QuantizationConfig):
        weight_block_size: Optional[list[int]] = None,
    ) -> None:
        super().__init__()
+        # raise ValueError(weight_block_size)

        self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized


--- a/vllm/model_executor/layers/quantization/ptpc_fp8.py
+++ b/vllm/model_executor/layers/quantization/ptpc_fp8.py
@@ -39,10 +39,10 @@ class PTPCFp8Config(Fp8Config):
            raise ValueError(
                "ptpc_fp8 quantization is supported only on ROCm.")

-        if not current_platform.has_device_capability(94):
-            raise ValueError(
-                "ptpc_fp8 quantization is supported only on AMD Instinct MI300 GPUs and newer."  # noqa: E501
-            )
+        # if not current_platform.has_device_capability(94):
+        #     raise ValueError(
+        #         "ptpc_fp8 quantization is supported only on AMD Instinct MI300 GPUs and newer."  # noqa: E501
+        #     )
        if activation_scheme == "static":
            raise ValueError(
                "ptpc_fp8 as of now only support dynamic quantization.")
@@ -112,7 +112,7 @@ class PTPCFp8LinearMethod(Fp8LinearMethod):

        # Update the layer with the new values.
        layer.weight = Parameter(
-            qweight.t(), requires_grad=False)  # Pretranspose the weight
+            qweight.contiguous(), requires_grad=False)  # Weight is kept untransposed
        layer.weight_scale = Parameter(weight_scale, requires_grad=False)
        layer.input_scale = None


--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -495,7 +495,7 @@ def apply_int8_linear(
    # ops.scaled_int8_quant supports both dynamic and static quant.
    # * dynamic, layer.input_scale is None and x_scale computed from x.
    # * static, layer.input_scale is scalar and x_scale is input_scale.
-
+    # print(1111)
    symmetric = azp_adj is None
    if input_scale is None and input_zero_point is None and symmetric is True:
        x_q, x_scale=per_token_quant_int8(input)

--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -189,7 +189,7 @@ class RocmPlatform(Platform):

    supported_quantization: list[str] = [
        "awq", "gptq", "fp8", "compressed-tensors", "fbgemm_fp8", "gguf",
-        "quark", "ptpc_fp8", "mxfp4", "petit_nvfp4", "torchao",
+        "quark", "ptpc_fp8", "mxfp4", "petit_nvfp4", "torchao", "dcu_int8",
        "moe_wna16", "slimquant_w4a8", "w8a8_int8", "awq_marlin", "slimquant_w4a8_marlin", "slimquant_compressed_tensors_marlin"
    ]